sessionInfo() #provides list of loaded packages and version of R.
## R version 4.3.2 (2023-10-31)
## Platform: aarch64-apple-darwin20 (64-bit)
## Running under: macOS Ventura 13.0
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/4.3-arm64/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.3-arm64/Resources/lib/libRlapack.dylib; LAPACK version 3.11.0
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## time zone: America/New_York
## tzcode source: internal
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## loaded via a namespace (and not attached):
## [1] digest_0.6.35 R6_2.5.1 fastmap_1.1.1 xfun_0.43
## [5] cachem_1.0.8 knitr_1.45 htmltools_0.5.8.1 rmarkdown_2.26
## [9] lifecycle_1.0.4 cli_3.6.2 sass_0.4.9 jquerylib_0.1.4
## [13] compiler_4.3.2 rstudioapi_0.16.0 tools_4.3.2 evaluate_0.23
## [17] bslib_0.7.0 yaml_2.3.8 rlang_1.1.3 jsonlite_1.8.8
#BiocManager::install("simplifyEnrichment")
First, load the necessary packages.
library(goseq)
## Loading required package: BiasedUrn
## Loading required package: geneLenDataBase
##
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(forcats)
library(ggplot2)
library(gridExtra)
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
library(tidyr)
library(grDevices)
library(reshape2)
##
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
##
## smiths
library(Rmisc)
## Loading required package: lattice
## Loading required package: plyr
## ------------------------------------------------------------------------------
## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)
## ------------------------------------------------------------------------------
##
## Attaching package: 'plyr'
## The following objects are masked from 'package:dplyr':
##
## arrange, count, desc, failwith, id, mutate, rename, summarise,
## summarize
library(ggpubr)
##
## Attaching package: 'ggpubr'
## The following object is masked from 'package:plyr':
##
## mutate
library(tibble)
library(gridExtra)
library(tidyr)
library(zoo)
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
library(ComplexHeatmap)
## Loading required package: grid
## ========================================
## ComplexHeatmap version 2.16.0
## Bioconductor page: http://bioconductor.org/packages/ComplexHeatmap/
## Github page: https://github.com/jokergoo/ComplexHeatmap
## Documentation: http://jokergoo.github.io/ComplexHeatmap-reference
##
## If you use it in published research, please cite either one:
## - Gu, Z. Complex Heatmap Visualization. iMeta 2022.
## - Gu, Z. Complex heatmaps reveal patterns and correlations in multidimensional
## genomic data. Bioinformatics 2016.
##
##
## The new InteractiveComplexHeatmap package can directly export static
## complex heatmaps into an interactive Shiny app with zero effort. Have a try!
##
## This message can be suppressed by:
## suppressPackageStartupMessages(library(ComplexHeatmap))
## ========================================
library(circlize)
## ========================================
## circlize version 0.4.16
## CRAN page: https://cran.r-project.org/package=circlize
## Github page: https://github.com/jokergoo/circlize
## Documentation: https://jokergoo.github.io/circlize_book/book/
##
## If you use it in published research, please cite:
## Gu, Z. circlize implements and enhances circular visualization
## in R. Bioinformatics 2014.
##
## This message can be suppressed by:
## suppressPackageStartupMessages(library(circlize))
## ========================================
library(GSEABase)
## Loading required package: BiocGenerics
##
## Attaching package: 'BiocGenerics'
## The following object is masked from 'package:gridExtra':
##
## combine
## The following objects are masked from 'package:dplyr':
##
## combine, intersect, setdiff, union
## The following objects are masked from 'package:stats':
##
## IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
##
## anyDuplicated, aperm, append, as.data.frame, basename, cbind,
## colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find,
## get, grep, grepl, intersect, is.unsorted, lapply, Map, mapply,
## match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
## Position, rank, rbind, Reduce, rownames, sapply, setdiff, sort,
## table, tapply, union, unique, unsplit, which.max, which.min
## Loading required package: Biobase
## Welcome to Bioconductor
##
## Vignettes contain introductory material; view with
## 'browseVignettes()'. To cite Bioconductor, see
## 'citation("Biobase")', and for packages 'citation("pkgname")'.
## Loading required package: annotate
## Loading required package: AnnotationDbi
## Loading required package: stats4
## Loading required package: IRanges
## Loading required package: S4Vectors
##
## Attaching package: 'S4Vectors'
## The following object is masked from 'package:plyr':
##
## rename
## The following object is masked from 'package:tidyr':
##
## expand
## The following objects are masked from 'package:dplyr':
##
## first, rename
## The following object is masked from 'package:geneLenDataBase':
##
## unfactor
## The following object is masked from 'package:utils':
##
## findMatches
## The following objects are masked from 'package:base':
##
## expand.grid, I, unname
##
## Attaching package: 'IRanges'
## The following object is masked from 'package:plyr':
##
## desc
## The following objects are masked from 'package:dplyr':
##
## collapse, desc, slice
##
## Attaching package: 'AnnotationDbi'
## The following object is masked from 'package:dplyr':
##
## select
## Loading required package: XML
## Loading required package: graph
##
## Attaching package: 'graph'
## The following object is masked from 'package:XML':
##
## addNode
## The following object is masked from 'package:circlize':
##
## degree
## The following object is masked from 'package:plyr':
##
## join
library(data.table)
##
## Attaching package: 'data.table'
## The following object is masked from 'package:IRanges':
##
## shift
## The following objects are masked from 'package:S4Vectors':
##
## first, second
## The following objects are masked from 'package:zoo':
##
## yearmon, yearqtr
## The following objects are masked from 'package:reshape2':
##
## dcast, melt
## The following objects are masked from 'package:dplyr':
##
## between, first, last
library(stringr)
##
## Attaching package: 'stringr'
## The following object is masked from 'package:graph':
##
## boundary
library(GenomicRanges)
## Loading required package: GenomeInfoDb
library(rtracklayer)
library(rrvgo)
library(simplifyEnrichment)
##
## ========================================
## simplifyEnrichment version 1.10.0
## Bioconductor page: https://bioconductor.org/packages/simplifyEnrichment/
## Github page: https://github.com/jokergoo/simplifyEnrichment
## Documentation: https://jokergoo.github.io/simplifyEnrichment/
## Examples: https://simplifyenrichment.github.io/
##
## If you use it in published research, please cite:
## Gu, Z. simplifyEnrichment: an R/Bioconductor package for Clustering and
## Visualizing Functional Enrichment Results, Genomics, Proteomics &
## Bioinformatics 2022.
##
## This message can be suppressed by:
## suppressPackageStartupMessages(library(simplifyEnrichment))
## ========================================
library(scales)
library(rtracklayer)
gff<-rtracklayer::import("../../data/Pocillopora_acuta_HIv2.genes_fixed.gff3")
gff<-as.data.frame(gff) %>% dplyr::select(-Parent)
dim(gff) # 478988 9
## [1] 478988 12
names(gff)
## [1] "seqnames" "start" "end" "width"
## [5] "strand" "source" "type" "score"
## [9] "phase" "ID" "transcript_id" "gene_id"
transcripts <- subset(gff, type == "transcript")
transcripts_gr <- makeGRangesFromDataFrame(transcripts, keep.extra.columns=TRUE) #extract length information
transcript_lengths <- width(transcripts_gr) #isolate length of each gene
seqnames<-transcripts_gr$ID #extract list of gene id
lengths<-cbind(seqnames, transcript_lengths)
lengths<-as.data.frame(lengths) #convert to data frame
dim(transcripts) #33730 13
## [1] 33730 12
kegg <- read.delim("../../data/Pocillopora_acuta_HIv2.genes.KEGG_results.txt",header = FALSE)
kegg <- as.data.frame(kegg)
colnames(kegg)[1] <- "gene_id"
colnames(kegg)[2] <- "KEGG_new"
head(kegg)
## gene_id KEGG_new
## 1 Pocillopora_acuta_HIv2___RNAseq.g24143.t1a
## 2 Pocillopora_acuta_HIv2___RNAseq.g24143.t1b K22584
## 3 Pocillopora_acuta_HIv2___RNAseq.g22918.t1
## 4 Pocillopora_acuta_HIv2___RNAseq.g18333.t1 K03386
## 5 Pocillopora_acuta_HIv2___RNAseq.g7985.t1
## 6 Pocillopora_acuta_HIv2___RNAseq.g13511.t1
eggnog<-read.delim("../../data/Pocillopora_acuta_HIv2.genes.EggNog_results.txt")#this file contains all of the go terms, descriptions, kegg, etc
eggnog<- plyr::rename(eggnog, c("X.query"="gene_id"))
head(eggnog,2)
## gene_id seed_ortholog evalue score
## 1 Pocillopora_acuta_HIv2___RNAseq.g24143.t1a 45351.EDO48725 2.16e-120 364
## 2 Pocillopora_acuta_HIv2___RNAseq.g18333.t1 45351.EDO38694 3.18e-123 355
## eggNOG_OGs
## 1 COG0620@1|root,KOG2263@2759|Eukaryota,38GZS@33154|Opisthokonta,3BNKS@33208|Metazoa
## 2 COG0450@1|root,KOG0852@2759|Eukaryota,38B9P@33154|Opisthokonta,3BGS4@33208|Metazoa
## max_annot_lvl COG_category
## 1 33208|Metazoa E
## 2 33208|Metazoa O
## Description Preferred_name
## 1 Cobalamin-independent synthase, Catalytic domain -
## 2 negative regulation of male germ cell proliferation PRDX4
## GOs
## 1 -
## 2 GO:0000003,GO:0001775,GO:0002252,GO:0002263,GO:0002274,GO:0002275,GO:0002283,GO:0002366,GO:0002376,GO:0002443,GO:0002444,GO:0002446,GO:0003006,GO:0003674,GO:0003824,GO:0004601,GO:0005488,GO:0005515,GO:0005575,GO:0005576,GO:0005615,GO:0005622,GO:0005623,GO:0005737,GO:0005783,GO:0005790,GO:0005829,GO:0006082,GO:0006457,GO:0006464,GO:0006468,GO:0006520,GO:0006575,GO:0006793,GO:0006796,GO:0006807,GO:0006810,GO:0006887,GO:0006915,GO:0006950,GO:0006952,GO:0006955,GO:0006979,GO:0007154,GO:0007165,GO:0007249,GO:0007252,GO:0007275,GO:0007276,GO:0007283,GO:0007548,GO:0008150,GO:0008152,GO:0008219,GO:0008285,GO:0008379,GO:0008406,GO:0008584,GO:0009056,GO:0009266,GO:0009409,GO:0009605,GO:0009607,GO:0009617,GO:0009628,GO:0009636,GO:0009893,GO:0009966,GO:0009967,GO:0009987,GO:0010467,GO:0010604,GO:0010646,GO:0010647,GO:0010941,GO:0010942,GO:0010950,GO:0010952,GO:0012501,GO:0012505,GO:0016043,GO:0016192,GO:0016209,GO:0016310,GO:0016491,GO:0016684,GO:0016999,GO:0017001,GO:0017144,GO:0019222,GO:0019471,GO:0019538,GO:0019725,GO:0019752,GO:0019953,GO:0022414,GO:0022417,GO:0023051,GO:0023052,GO:0023056,GO:0030141,GO:0030162,GO:0030198,GO:0031323,GO:0031325,GO:0031410,GO:0031974,GO:0031982,GO:0031983,GO:0032268,GO:0032270,GO:0032501,GO:0032502,GO:0032504,GO:0032940,GO:0033554,GO:0034774,GO:0035556,GO:0036211,GO:0036230,GO:0042119,GO:0042127,GO:0042221,GO:0042592,GO:0042737,GO:0042742,GO:0042743,GO:0042744,GO:0042802,GO:0042803,GO:0042981,GO:0043062,GO:0043065,GO:0043067,GO:0043068,GO:0043085,GO:0043170,GO:0043207,GO:0043226,GO:0043227,GO:0043229,GO:0043231,GO:0043233,GO:0043280,GO:0043281,GO:0043299,GO:0043312,GO:0043412,GO:0043436,GO:0043900,GO:0043901,GO:0044093,GO:0044237,GO:0044238,GO:0044248,GO:0044260,GO:0044267,GO:0044281,GO:0044421,GO:0044422,GO:0044424,GO:0044433,GO:0044444,GO:0044446,GO:0044464,GO:0044703,GO:0045055,GO:0045137,GO:0045321,GO:0045454,GO:0045862,GO:0046425,GO:0046427,GO:0046546,GO:0046661,GO:0046903,GO:0046983,GO:0048232,GO:0048513,GO:0048518,GO:0048519,GO:0048522,GO:0048523,GO:0048583,GO:0048584,GO:0048608,GO:0048609,GO:0048731,GO:0048856,GO:0050789,GO:0050790,GO:0050794,GO:0050896,GO:0051171,GO:0051173,GO:0051179,GO:0051186,GO:0051187,GO:0051234,GO:0051239,GO:0051241,GO:0051246,GO:0051247,GO:0051336,GO:0051345,GO:0051604,GO:0051704,GO:0051707,GO:0051716,GO:0051920,GO:0052547,GO:0052548,GO:0055114,GO:0060205,GO:0060255,GO:0061458,GO:0065007,GO:0065008,GO:0065009,GO:0070013,GO:0070417,GO:0070887,GO:0071704,GO:0071840,GO:0072593,GO:0080090,GO:0097190,GO:0097237,GO:0097708,GO:0098542,GO:0098754,GO:0098869,GO:0099503,GO:0101002,GO:1901564,GO:1901605,GO:1902531,GO:1902533,GO:1904813,GO:1904892,GO:1904894,GO:1905936,GO:1905937,GO:1990748,GO:2000116,GO:2000241,GO:2000242,GO:2000254,GO:2000255,GO:2001056,GO:2001233,GO:2001235,GO:2001267,GO:2001269
## EC KEGG_ko
## 1 2.1.1.14 ko:K00549
## 2 1.11.1.15 ko:K03386
## KEGG_Pathway
## 1 ko00270,ko00450,ko01100,ko01110,ko01230,map00270,map00450,map01100,map01110,map01230
## 2 ko04214,map04214
## KEGG_Module KEGG_Reaction KEGG_rclass
## 1 M00017 R04405,R09365 RC00035,RC00113,RC01241
## 2 - - -
## BRITE KEGG_TC CAZy BiGG_Reaction
## 1 ko00000,ko00001,ko00002,ko01000 - - -
## 2 ko00000,ko00001,ko01000,ko04147 - - -
## PFAMs
## 1 Meth_synt_2
## 2 1-cysPrx_C,AhpC-TSA
gogene <- merge(transcripts, eggnog, by=c("gene_id"), all=T)
gogene <- merge(gogene, kegg, by=c("gene_id"), all=T)
head(gogene,2)
## gene_id seqnames
## 1 Pocillopora_acuta_HIv2___RNAseq.10002_t Pocillopora_acuta_HIv2___Sc0000013
## 2 Pocillopora_acuta_HIv2___RNAseq.10010_t Pocillopora_acuta_HIv2___Sc0000013
## start end width strand source type score.x phase
## 1 4542087 4551503 9417 + AUGUSTUS transcript NA NA
## 2 4639103 4647350 8248 + AUGUSTUS transcript NA NA
## ID
## 1 Pocillopora_acuta_HIv2___RNAseq.10002_t
## 2 Pocillopora_acuta_HIv2___RNAseq.10010_t
## transcript_id seed_ortholog evalue score.y
## 1 Pocillopora_acuta_HIv2___RNAseq.10002_t 45351.EDO27354 2.41e-93 317
## 2 Pocillopora_acuta_HIv2___RNAseq.10010_t 6087.XP_002166004.2 1.28e-38 164
## eggNOG_OGs
## 1 COG0666@1|root,KOG0510@2759|Eukaryota,38G7Q@33154|Opisthokonta,3BCDU@33208|Metazoa
## 2 COG0666@1|root,KOG4177@2759|Eukaryota
## max_annot_lvl COG_category Description
## 1 33208|Metazoa DZ osmolarity-sensing cation channel activity
## 2 2759|Eukaryota I spectrin binding
## Preferred_name
## 1 TRPA1
## 2 -
## GOs
## 1 GO:0000302,GO:0001580,GO:0002791,GO:0002793,GO:0003008,GO:0003012,GO:0003674,GO:0004888,GO:0005034,GO:0005215,GO:0005216,GO:0005217,GO:0005244,GO:0005245,GO:0005261,GO:0005262,GO:0005488,GO:0005515,GO:0005575,GO:0005623,GO:0005886,GO:0005887,GO:0006810,GO:0006811,GO:0006812,GO:0006816,GO:0006873,GO:0006874,GO:0006875,GO:0006936,GO:0006939,GO:0006950,GO:0006979,GO:0007154,GO:0007165,GO:0007166,GO:0007204,GO:0007600,GO:0007602,GO:0007606,GO:0007610,GO:0007638,GO:0008150,GO:0008324,GO:0009266,GO:0009314,GO:0009408,GO:0009409,GO:0009410,GO:0009416,GO:0009453,GO:0009581,GO:0009582,GO:0009583,GO:0009593,GO:0009605,GO:0009612,GO:0009628,GO:0009636,GO:0009719,GO:0009966,GO:0009967,GO:0009987,GO:0010033,GO:0010035,GO:0010037,GO:0010243,GO:0010378,GO:0010646,GO:0010647,GO:0010817,GO:0014070,GO:0014074,GO:0014832,GO:0014848,GO:0015075,GO:0015085,GO:0015267,GO:0015276,GO:0015278,GO:0015318,GO:0016020,GO:0016021,GO:0016043,GO:0016048,GO:0016324,GO:0019233,GO:0019722,GO:0019725,GO:0019932,GO:0022607,GO:0022803,GO:0022832,GO:0022834,GO:0022836,GO:0022838,GO:0022839,GO:0022843,GO:0022857,GO:0022890,GO:0023041,GO:0023051,GO:0023052,GO:0023056,GO:0030001,GO:0030003,GO:0030424,GO:0031000,GO:0031224,GO:0031226,GO:0031644,GO:0031646,GO:0032024,GO:0032421,GO:0032501,GO:0032879,GO:0032880,GO:0032991,GO:0033554,GO:0033555,GO:0034220,GO:0034605,GO:0034702,GO:0034703,GO:0035556,GO:0035690,GO:0035774,GO:0036270,GO:0038023,GO:0040011,GO:0040040,GO:0042221,GO:0042330,GO:0042331,GO:0042391,GO:0042493,GO:0042542,GO:0042592,GO:0042752,GO:0042802,GO:0042995,GO:0043005,GO:0043052,GO:0043269,GO:0043270,GO:0043279,GO:0043933,GO:0044057,GO:0044070,GO:0044085,GO:0044425,GO:0044459,GO:0044464,GO:0045177,GO:0046677,GO:0046873,GO:0046883,GO:0046887,GO:0046957,GO:0048265,GO:0048518,GO:0048519,GO:0048522,GO:0048523,GO:0048583,GO:0048584,GO:0048878,GO:0050708,GO:0050714,GO:0050789,GO:0050794,GO:0050796,GO:0050801,GO:0050848,GO:0050850,GO:0050877,GO:0050896,GO:0050906,GO:0050907,GO:0050909,GO:0050912,GO:0050913,GO:0050951,GO:0050954,GO:0050955,GO:0050960,GO:0050961,GO:0050965,GO:0050966,GO:0050968,GO:0050974,GO:0050982,GO:0051046,GO:0051047,GO:0051049,GO:0051050,GO:0051179,GO:0051209,GO:0051222,GO:0051223,GO:0051234,GO:0051239,GO:0051240,GO:0051259,GO:0051260,GO:0051262,GO:0051282,GO:0051283,GO:0051289,GO:0051480,GO:0051606,GO:0051641,GO:0051649,GO:0051716,GO:0051930,GO:0051931,GO:0051969,GO:0052129,GO:0055065,GO:0055074,GO:0055080,GO:0055082,GO:0055085,GO:0060089,GO:0060341,GO:0060401,GO:0060402,GO:0061178,GO:0065003,GO:0065007,GO:0065008,GO:0070201,GO:0070417,GO:0070588,GO:0070838,GO:0070887,GO:0071241,GO:0071244,GO:0071310,GO:0071312,GO:0071313,GO:0071407,GO:0071415,GO:0071417,GO:0071466,GO:0071495,GO:0071840,GO:0071944,GO:0072347,GO:0072503,GO:0072507,GO:0072511,GO:0090087,GO:0090276,GO:0090277,GO:0097458,GO:0097553,GO:0097603,GO:0097604,GO:0098590,GO:0098655,GO:0098660,GO:0098662,GO:0098771,GO:0098796,GO:0098862,GO:0098900,GO:0098908,GO:0099094,GO:0099604,GO:0120025,GO:1901698,GO:1901699,GO:1901700,GO:1901701,GO:1902495,GO:1902531,GO:1902533,GO:1903522,GO:1903530,GO:1903532,GO:1903793,GO:1904058,GO:1904951,GO:1990351,GO:1990760
## 2 -
## EC KEGG_ko KEGG_Pathway KEGG_Module KEGG_Reaction KEGG_rclass
## 1 - ko:K04984 ko04750,map04750 - - -
## 2 - - - - - -
## BRITE KEGG_TC CAZy
## 1 ko00000,ko00001,ko04040 1.A.4.6.1,1.A.4.6.2,1.A.4.6.3,1.A.4.6.5 -
## 2 - - -
## BiGG_Reaction PFAMs KEGG_new
## 1 - Ank,Ank_2,Ank_3,Ank_4,Ion_trans K04984
## 2 - Ank,Ank_2,Ank_4,Ank_5,Ion_trans K04984
dim(gogene)
## [1] 33730 33
geneInfo <- read.csv("../../output/WGCNA/WGCNA_ModuleMembership.csv") #this file was generated from the WGCNA analyses and contains the modules of interest
geneInfo<- plyr::rename(geneInfo, c("X"="gene_id"))
dim(geneInfo) # there are 9012 genes in our gene info file
## [1] 9012 40
geneInfo <- merge(gogene, geneInfo, by=c("gene_id")) #merging the GO and Kegg info to module membership for the 9012 genes
Format GO terms to remove dashes and quotes and separate by semicolons (replace , with ;) in GOs column
geneInfo$GOs <- gsub(",", ";", geneInfo$GOs)
geneInfo$GOs <- gsub('"', "", geneInfo$GOs)
geneInfo$GOs <- gsub("-", NA, geneInfo$GOs)
geneInfo$KEGG_new[geneInfo$KEGG_new == ""] <- NA
unique(geneInfo$moduleColor)
## [1] "green" "blue" "salmon" "turquoise" "yellow"
## [6] "black" "red" "magenta" "lightcyan" "purple"
## [11] "brown" "pink" "midnightblue" "tan" "cyan"
geneInfo$Length<-lengths$transcript_lengths[match(geneInfo$gene_id, lengths$seqnames)]
dim(geneInfo)
## [1] 9012 73
write.csv(geneInfo, file = "../../output/WGCNA/GO_analysis/geneInfo_WGCNA.csv") #gene info for reference/supplement
calc_up_mods <- c("brown", "red", "black", "pink", "salmon", "blue")
nrow(geneInfo %>% dplyr::filter(moduleColor=="brown")) #942
## [1] 942
nrow(geneInfo %>% dplyr::filter(moduleColor=="red")) #425
## [1] 425
nrow(geneInfo %>% filter(moduleColor=="black")) #396
## [1] 396
nrow(geneInfo %>% filter(moduleColor=="pink")) #220
## [1] 220
nrow(geneInfo %>% filter(moduleColor=="salmon")) #154
## [1] 154
nrow(geneInfo %>% filter(moduleColor=="blue")) #1989
## [1] 1989
sum(nrow(geneInfo %>% dplyr::filter(moduleColor=="brown")), nrow(geneInfo %>% dplyr::filter(moduleColor=="red")), nrow(geneInfo %>% filter(moduleColor=="black")), nrow(geneInfo %>% filter(moduleColor=="pink")), nrow(geneInfo %>% filter(moduleColor=="salmon")), nrow(geneInfo %>% filter(moduleColor=="blue")))
## [1] 4126
# 4126
calc_down_mods <- c("turquoise","magenta","lightcyan")
nrow(geneInfo %>% dplyr::filter(moduleColor=="turquoise")) #2558
## [1] 2558
nrow(geneInfo %>% dplyr::filter(moduleColor=="magenta")) #219
## [1] 219
nrow(geneInfo %>% filter(moduleColor=="lightcyan")) #65
## [1] 65
sum(nrow(geneInfo %>% dplyr::filter(moduleColor=="turquoise")), nrow(geneInfo %>% dplyr::filter(moduleColor=="magenta")), nrow(geneInfo %>% filter(moduleColor=="lightcyan")))
## [1] 2842
# 2842
other_mods <- c("green","yellow", "purple", "midnightblue","cyan","tan")
sum(nrow(geneInfo %>% dplyr::filter(moduleColor=="green")), nrow(geneInfo %>% dplyr::filter(moduleColor=="yellow")), nrow(geneInfo %>% filter(moduleColor=="purple")), nrow(geneInfo %>% filter(moduleColor=="midnightblue")), nrow(geneInfo %>% filter(moduleColor=="cyan")),nrow(geneInfo %>% filter(moduleColor=="tan")))
## [1] 2044
# 2044
# 4126 + 2842 + 2044 = 9012, which represents all of our genes
### Generate vector with names of all genes
ALL.vector <- c(geneInfo$gene_id)
### Generate length vector for all genes
LENGTH.vector <- as.integer(geneInfo$Length)
4126 genes are in the 6 modules significantly upregulated by calcification.
### Generate vector with names in just the module we are analyzing
# ID.vector <- geneInfo %>%
# filter(moduleColor==c("brown", "red", "black", "pink", "salmon", "green")) %>%
# #get_rows(.data[[module]]))%>%
# pull(gene_id)
ID.vector <- geneInfo %>%
filter(moduleColor %in% c("brown", "red", "black", "pink", "salmon", "blue")) %>%
pull(gene_id)
length(ID.vector) #4126
## [1] 4126
##Get a list of GO Terms for the all 9012 genes
GO.terms <- geneInfo %>%
dplyr::select(gene_id, GOs) %>% rename(GOs = "GO.terms")
dim(GO.terms) #9012 2
## [1] 9012 2
##Format to have one goterm per row with gene ID repeated
split <- strsplit(as.character(GO.terms$GO.terms), ";")
split2 <- data.frame(v1 = rep.int(GO.terms$gene, sapply(split, length)), v2 = unlist(split))
colnames(split2) <- c("gene", "GO.terms")
GO.terms<-split2
##Construct list of genes with 1 for genes in module and 0 for genes not in the module
gene.vector=as.integer(ALL.vector %in% ID.vector)
names(gene.vector)<-ALL.vector#set names
#weight gene vector by bias for length of gene
pwf<-nullp(gene.vector, ID.vector, bias.data=LENGTH.vector)
## Warning in pcls(G): initial point very close to some inequality constraints
rrvgo package to
reduce redundancy in list of GO terms.#run goseq using Wallenius method for all categories of GO terms
GO.wall<-goseq(pwf, ID.vector, gene2cat=GO.terms, test.cats=c("GO:BP", "GO:MF", "GO:CC"), method="Wallenius", use_genes_without_cat=TRUE)
## Using manually entered categories.
## Calculating the p-values...
## 'select()' returned 1:1 mapping between keys and columns
GO <- GO.wall[order(GO.wall$over_represented_pvalue),]
colnames(GO)[1] <- "GOterm"
#Filtering for p < 0.05
GO_05 <- GO %>%
dplyr::filter(over_represented_pvalue<0.05) %>%
dplyr::arrange(., ontology, over_represented_pvalue)
#Write file of results
write.csv(GO_05, file = "../../output/WGCNA/GO_analysis/goseq_pattern_calcification.csv")
GO_05 <- read.csv("../../output/WGCNA/GO_analysis/goseq_pattern_calcification.csv")
go_results_BP <- GO_05 %>%
filter(ontology=="BP")%>%
filter(over_represented_pvalue != "NA") %>%
#filter(numInCat>10)%>%
arrange(., over_represented_pvalue)
dim(go_results_BP)
## [1] 654 8
head(go_results_BP)
## X GOterm over_represented_pvalue under_represented_pvalue numDEInCat
## 1 1 GO:0050794 8.436918e-08 1.0000000 1395
## 2 2 GO:0065007 1.553493e-06 0.9999989 1574
## 3 3 GO:0050789 1.584882e-06 0.9999989 1479
## 4 4 GO:0048523 1.581326e-05 0.9999878 800
## 5 5 GO:0048518 3.043369e-05 0.9999758 950
## 6 6 GO:0051336 3.403563e-05 0.9999783 232
## numInCat term ontology
## 1 2785 regulation of cellular process BP
## 2 3184 biological regulation BP
## 3 2981 regulation of biological process BP
## 4 1569 negative regulation of cellular process BP
## 5 1888 positive regulation of biological process BP
## 6 415 regulation of hydrolase activity BP
Collapsing with simplifyEnrichment package
library(simplifyEnrichment)
BP_terms <- go_results_BP$GOterm
Mat <- GO_similarity(BP_terms, ont = "BP", db = "org.Ce.eg.db")
#compare_clustering_methods(Mat, plot_type = "heatmap")
pdf(file = "../../output/WGCNA/GO_analysis/semantic_similarity/calc_up_mods_GOSEM.pdf", width = 7, height = 5)
simplifyGO(Mat, word_cloud_grob_param = list(max_width = 50), max_words=20)
## Cluster 654 terms by 'binary_cut'... 184 clusters, used 2.056766 secs.
## 'magick' package is suggested to install to give better rasterization.
##
## Set `ht_opt$message = FALSE` to turn off this message.
## Perform keywords enrichment for 6 GO lists...
dev.off()
## quartz_off_screen
## 2
CalcUpMods_GO_P05 <- go_results_BP %>%
mutate(term = fct_reorder(term, over_represented_pvalue, .desc = TRUE)) %>%
ggplot(aes(x=term, y=over_represented_pvalue) ) +
geom_segment(aes(x=term ,xend=term, y=0, yend=over_represented_pvalue), color="grey") +
geom_point(size=1, color="#69b3a2") +
coord_flip() +
theme(
panel.grid.minor.y = element_blank(),
panel.grid.major.y = element_blank(),
legend.position="none"
) +
xlab("") +
ylab("over_represented_pvalue") +
ggtitle("Biological Process GO Terms Enriched in Modules Upregulated in Calcification") + #add a main title
theme(plot.title = element_text(face = 'bold',
size = 3,
hjust = 0)) +
theme_bw() + #Set background color
theme(panel.border = element_blank(), # Set border
panel.grid.major = element_blank(), #Set major gridlines
panel.grid.minor = element_blank(), #Set minor gridlines
axis.line = element_line(colour = "black"), #Set axes color
axis.text.y = element_text(size = 2), #set y axis labels size
plot.background=element_blank(),#Set the plot background
legend.position="none")
CalcUpMods_GO_P05
ggsave("../../output/WGCNA/GO_analysis/CalcUpMods_GO_P05.pdf", CalcUpMods_GO_P05, width = 12, height = 36, units = c("in"), dpi=300, limitsize=FALSE)
library(rrvgo)
#Reduce/collapse GO term set with the rrvgo package
simMatrix <- calculateSimMatrix(go_results_BP$GOterm,
orgdb="org.Ce.eg.db", #c. elegans database
ont="BP",
method="Rel")
## preparing gene to GO mapping data...
## preparing IC data...
#calculate similarity
scores <- setNames(-log(go_results_BP$over_represented_pvalue), go_results_BP$GOterm)
reducedTerms <- reduceSimMatrix(simMatrix,
scores,
threshold=0.7,
orgdb="org.Ce.eg.db")
dim(reducedTerms)
## [1] 508 10
#keep only the goterms from the reduced list
go_results_BP_reduced <- go_results_BP %>%
filter(GOterm %in% reducedTerms$go)
#add in parent terms to list of go terms
go_results_BP_reduced$ParentTerm <- reducedTerms$parentTerm[match(go_results_BP_reduced$GOterm, reducedTerms$go)]
length(unique(go_results_BP_reduced$ParentTerm))
## [1] 73
The reduced list of terms is 508 terms that falls under 73 parent terms.
write.csv(go_results_BP_reduced, "../../output/WGCNA/GO_analysis/goseq_pattern_calcification_filtered.csv")
Plot reduced terms
#plot significantly enriched GO terms by Slim Category faceted by slim term
GO.plot_calcification <- ggplot(go_results_BP_reduced, aes(x = ontology, y = term)) +
geom_point(aes(size=over_represented_pvalue)) +
scale_size(name="Over rep. p-value", trans="reverse", range=c(1,3))+
facet_grid(ParentTerm ~ ., scales = "free", labeller = label_wrap_gen(width = 5, multi_line = TRUE))+
theme_bw() + theme(panel.border = element_blank(), panel.grid.major = element_blank(),
panel.grid.minor = element_blank(), axis.line = element_line(colour = "black"),
strip.text.y = element_text(angle=0, size = 10),
strip.text.x = element_text(size = 20),
axis.text = element_text(size = 8),
axis.title.x = element_blank(),
axis.title.y = element_blank())
GO.plot_calcification
ggsave(filename="../../output/WGCNA/GO_analysis/GO.plot_calcification.pdf", plot=GO.plot_calcification, dpi=300, height=100, units="in", limitsize=FALSE)
## Saving 7 x 100 in image
CalcUpMods_GO_P05_reduced <- go_results_BP_reduced %>%
mutate(term = fct_reorder(term, over_represented_pvalue, .desc = TRUE)) %>%
ggplot(aes(x=term, y=over_represented_pvalue) ) +
geom_segment(aes(x=term ,xend=term, y=0, yend=over_represented_pvalue), color="grey") +
geom_point(size=1, color="#69b3a2") +
coord_flip() +
theme(
panel.grid.minor.y = element_blank(),
panel.grid.major.y = element_blank(),
legend.position="none"
) +
xlab("") +
ylab("over_represented_pvalue") +
ggtitle("Biological Process GO Terms Enriched in Modules Upregulated in Calcification") + #add a main title
theme(plot.title = element_text(face = 'bold',
size = 3,
hjust = 0)) +
theme_bw() + #Set background color
theme(panel.border = element_blank(), # Set border
panel.grid.major = element_blank(), #Set major gridlines
panel.grid.minor = element_blank(), #Set minor gridlines
axis.line = element_line(colour = "black"), #Set axes color
axis.text.y = element_text(size = 2), #set y axis labels size
plot.background=element_blank(),#Set the plot background
legend.position="none")
CalcUpMods_GO_P05_reduced
ggsave("../../output/WGCNA/GO_analysis/CalcUpMods_GO_P05_reduced.pdf", CalcUpMods_GO_P05_reduced, width = 12, height = 36, units = c("in"), dpi=300, limitsize=FALSE)
### Generate vector with names of all genes
ALL.vector <- c(geneInfo$gene_id)
### Generate length vector for all genes
LENGTH.vector <- as.integer(geneInfo$Length)
ID.vector <- geneInfo %>%
filter(moduleColor %in% c("brown")) %>%
pull(gene_id)
length(ID.vector) #942
## [1] 942
##Get a list of GO Terms for the all 9012 genes
GO.terms <- geneInfo %>%
dplyr::select(gene_id, GOs) %>% rename(GOs = "GO.terms")
dim(GO.terms) #9012 2
## [1] 9012 2
##Format to have one goterm per row with gene ID repeated
split <- strsplit(as.character(GO.terms$GO.terms), ";")
split2 <- data.frame(v1 = rep.int(GO.terms$gene, sapply(split, length)), v2 = unlist(split))
colnames(split2) <- c("gene", "GO.terms")
GO.terms<-split2
##Construct list of genes with 1 for genes in module and 0 for genes not in the module
gene.vector=as.integer(ALL.vector %in% ID.vector)
names(gene.vector)<-ALL.vector #set names
#weight gene vector by bias for length of gene
pwf<-nullp(gene.vector, ID.vector, bias.data=LENGTH.vector)
#run goseq using Wallenius method for all categories of GO terms
GO.wall<-goseq(pwf, ID.vector, gene2cat=GO.terms, test.cats=c("GO:BP", "GO:MF", "GO:CC"), method="Wallenius", use_genes_without_cat=TRUE)
## Using manually entered categories.
## Calculating the p-values...
## 'select()' returned 1:1 mapping between keys and columns
GO <- GO.wall[order(GO.wall$over_represented_pvalue),]
colnames(GO)[1] <- "GOterm"
#Filtering for p < 0.05
GO_05_brown <- GO %>%
dplyr::filter(over_represented_pvalue<0.05) %>%
dplyr::arrange(., ontology, over_represented_pvalue) %>% mutate(Module = "Brown")
go_results_BP <- GO_05_brown %>%
filter(ontology=="BP")%>%
filter(over_represented_pvalue != "NA") %>%
#filter(numInCat>10)%>%
arrange(., over_represented_pvalue)
dim(go_results_BP)
## [1] 206 8
Top10Plot <- go_results_BP %>%
top_n(10, wt=-over_represented_pvalue) %>%
mutate(hitsPerc=numDEInCat/numInCat) %>%
ggplot(aes(x=hitsPerc,
y=reorder(term, -hitsPerc),
colour=over_represented_pvalue,
size=numDEInCat)) +
geom_point() +
scale_colour_gradient(low = "red", high = "blue", name = "p value",guide=guide_colorbar(reverse=TRUE)) +
expand_limits(x=0) +
scale_y_discrete(labels = label_wrap(40)) +
labs(title = "Top 10 Enriched Biological Process GO Terms, Brown Module", x="Proportion of Genes in Module per GO Term", y="GO term", colour="p value", size="Count") +
theme_bw() +
theme(axis.text.y = element_text(size = 8),
axis.title = element_text(size = 10),
plot.title = element_text(hjust = 0.5, size = 10))
Top10Plot
ggsave("../../output/WGCNA/GO_analysis/Top10_BP_by_mod/Brown.pdf", Top10Plot, width = 5, height = 5, units = c("in"), dpi=300, limitsize=FALSE)
Collapsing with simplifyEnrichment package
library(simplifyEnrichment)
BP_terms <- go_results_BP$GOterm
Mat <- GO_similarity(BP_terms, ont = "BP", db = "org.Ce.eg.db")
pdf(file = "../../output/WGCNA/GO_analysis/semantic_similarity/calc_up_mods_GOSEM_brown.pdf", width = 7, height = 5)
simplifyGO(Mat, word_cloud_grob_param = list(max_width = 50), min_term = 3, exclude_words = c("regulation", "process"), order_by_size = TRUE)
## Cluster 206 terms by 'binary_cut'... 73 clusters, used 0.3661621 secs.
## 'magick' package is suggested to install to give better rasterization.
##
## Set `ht_opt$message = FALSE` to turn off this message.
## Perform keywords enrichment for 11 GO lists...
dev.off()
## quartz_off_screen
## 2
ID.vector <- geneInfo %>%
filter(moduleColor %in% c("red")) %>%
pull(gene_id)
length(ID.vector) #425
## [1] 425
##Get a list of GO Terms for the all 9012 genes
GO.terms <- geneInfo %>%
dplyr::select(gene_id, GOs) %>% rename(GOs = "GO.terms")
dim(GO.terms) #9012 2
## [1] 9012 2
##Format to have one goterm per row with gene ID repeated
split <- strsplit(as.character(GO.terms$GO.terms), ";")
split2 <- data.frame(v1 = rep.int(GO.terms$gene, sapply(split, length)), v2 = unlist(split))
colnames(split2) <- c("gene", "GO.terms")
GO.terms<-split2
##Construct list of genes with 1 for genes in module and 0 for genes not in the module
gene.vector=as.integer(ALL.vector %in% ID.vector)
names(gene.vector)<-ALL.vector#set names
#weight gene vector by bias for length of gene
pwf<-nullp(gene.vector, ID.vector, bias.data=LENGTH.vector)
#run goseq using Wallenius method for all categories of GO terms
GO.wall<-goseq(pwf, ID.vector, gene2cat=GO.terms, test.cats=c("GO:BP", "GO:MF", "GO:CC"), method="Wallenius", use_genes_without_cat=TRUE)
## Using manually entered categories.
## Calculating the p-values...
## 'select()' returned 1:1 mapping between keys and columns
GO <- GO.wall[order(GO.wall$over_represented_pvalue),]
colnames(GO)[1] <- "GOterm"
#Filtering for p < 0.05
GO_05_red <- GO %>%
dplyr::filter(over_represented_pvalue<0.05) %>%
dplyr::arrange(., ontology, over_represented_pvalue) %>% mutate(Module = "Red")
go_results_BP <- GO_05_red %>%
filter(ontology=="BP")%>%
filter(over_represented_pvalue != "NA") %>%
#filter(numInCat>10)%>%
arrange(., over_represented_pvalue)
dim(go_results_BP)
## [1] 209 8
#Write file of results
write.csv(GO_05_red, file = "../../output/WGCNA/GO_analysis/goseq_pattern_calcification_red.csv")
Top10Plot <- go_results_BP %>%
top_n(10, wt=-over_represented_pvalue) %>%
mutate(hitsPerc=numDEInCat/numInCat) %>%
ggplot(aes(x=hitsPerc,
y=reorder(term, -hitsPerc),
colour=over_represented_pvalue,
size=numDEInCat)) +
geom_point() +
scale_colour_gradient(low = "red", high = "blue", name = "p value",guide=guide_colorbar(reverse=TRUE)) +
expand_limits(x=0) +
scale_y_discrete(labels = label_wrap(40)) +
labs(title = "Top 10 Enriched Biological Process GO Terms, Red Module", x="Proportion of Genes in Module per GO Term", y="GO term", colour="p value", size="Count") +
theme_bw() +
theme(axis.text.y = element_text(size = 8),
axis.title = element_text(size = 10),
plot.title = element_text(hjust = 0.5, size = 10))
Top10Plot
ggsave("../../output/WGCNA/GO_analysis/Top10_BP_by_mod/Red.pdf", Top10Plot, width = 5, height = 5, units = c("in"), dpi=300, limitsize=FALSE)
Collapsing with simplifyEnrichment package
library(simplifyEnrichment)
BP_terms <- go_results_BP$GOterm
Mat <- GO_similarity(BP_terms, ont = "BP", db = "org.Ce.eg.db")
pdf(file = "../../output/WGCNA/GO_analysis/semantic_similarity/calc_up_mods_GOSEM_red.pdf", width = 7, height = 5)
result <- simplifyGO(Mat, word_cloud_grob_param = list(max_width = 50), min_term = 3, exclude_words = c("regulation", "process"), order_by_size = TRUE)
## Cluster 209 terms by 'binary_cut'... 147 clusters, used 1.343836 secs.
## 'magick' package is suggested to install to give better rasterization.
##
## Set `ht_opt$message = FALSE` to turn off this message.
## Perform keywords enrichment for 9 GO lists...
result
## id cluster
## 1 GO:0035725 1
## 2 GO:0060075 2
## 3 GO:0031943 3
## 4 GO:0031944 4
## 5 GO:0031946 5
## 6 GO:0031947 6
## 7 GO:0006814 1
## 8 GO:0051941 7
## 9 GO:0051946 8
## 10 GO:1903280 9
## 11 GO:1903416 10
## 12 GO:0090032 11
## 13 GO:0060439 12
## 14 GO:0046719 13
## 15 GO:0046726 14
## 16 GO:0000707 15
## 17 GO:0000730 15
## 18 GO:0042148 15
## 19 GO:0090735 16
## 20 GO:0010894 17
## 21 GO:0086012 18
## 22 GO:0032351 19
## 23 GO:0032353 20
## 24 GO:0070212 21
## 25 GO:0090030 22
## 26 GO:0002087 23
## 27 GO:0030007 24
## 28 GO:0036376 1
## 29 GO:1903279 25
## 30 GO:0007630 26
## 31 GO:1902808 27
## 32 GO:0032287 28
## 33 GO:0046466 29
## 34 GO:0002028 1
## 35 GO:0086010 30
## 36 GO:0045823 31
## 37 GO:0000724 15
## 38 GO:0000725 15
## 39 GO:1901992 27
## 40 GO:0035158 32
## 41 GO:0044065 33
## 42 GO:0140115 1
## 43 GO:1990573 1
## 44 GO:0015701 34
## 45 GO:0045939 35
## 46 GO:0051890 36
## 47 GO:0086064 37
## 48 GO:0051055 38
## 49 GO:0098655 1
## 50 GO:1900087 27
## 51 GO:2000617 39
## 52 GO:0019377 40
## 53 GO:0007143 27
## 54 GO:0045988 41
## 55 GO:1903053 16
## 56 GO:0055119 42
## 57 GO:1901989 27
## 58 GO:0043217 43
## 59 GO:0030509 44
## 60 GO:0032042 15
## 61 GO:0071805 1
## 62 GO:0045003 15
## 63 GO:2001137 45
## 64 GO:0010248 1
## 65 GO:0002036 46
## 66 GO:0010958 47
## 67 GO:0002807 48
## 68 GO:1904357 15
## 69 GO:0046498 40
## 70 GO:0035562 49
## 71 GO:2000615 50
## 72 GO:0086004 51
## 73 GO:1903115 52
## 74 GO:0090075 53
## 75 GO:0086036 54
## 76 GO:0006883 24
## 77 GO:0098662 1
## 78 GO:0030720 55
## 79 GO:0046128 40
## 80 GO:0060438 56
## 81 GO:0030206 40
## 82 GO:1904646 57
## 83 GO:1902600 1
## 84 GO:0071772 44
## 85 GO:0071773 44
## 86 GO:0007400 58
## 87 GO:1903055 59
## 88 GO:1903789 60
## 89 GO:0006813 1
## 90 GO:0098660 1
## 91 GO:0035203 61
## 92 GO:0035204 62
## 93 GO:0045611 63
## 94 GO:0045613 64
## 95 GO:0045614 65
## 96 GO:0046885 66
## 97 GO:0060081 67
## 98 GO:1902476 1
## 99 GO:0042278 40
## 100 GO:0006023 40
## 101 GO:0045143 27
## 102 GO:0051899 30
## 103 GO:0030510 44
## 104 GO:1901879 16
## 105 GO:0045623 68
## 106 GO:0035172 69
## 107 GO:0006821 1
## 108 GO:1904645 70
## 109 GO:0035066 71
## 110 GO:0000038 40
## 111 GO:1901657 40
## 112 GO:0060391 72
## 113 GO:0051580 30
## 114 GO:0043696 73
## 115 GO:0043697 74
## 116 GO:0006302 15
## 117 GO:1990535 75
## 118 GO:0055075 24
## 119 GO:0048857 76
## 120 GO:0060413 77
## 121 GO:0007604 78
## 122 GO:1990146 79
## 123 GO:1904762 80
## 124 GO:0002213 81
## 125 GO:0009608 82
## 126 GO:0035163 83
## 127 GO:0035165 84
## 128 GO:0035622 85
## 129 GO:0042480 86
## 130 GO:0045316 87
## 131 GO:0045967 88
## 132 GO:0061073 89
## 133 GO:0061331 90
## 134 GO:0072574 91
## 135 GO:0072575 92
## 136 GO:1990705 93
## 137 GO:0051758 94
## 138 GO:0070649 95
## 139 GO:0110009 96
## 140 GO:0070213 97
## 141 GO:0003198 98
## 142 GO:0003220 99
## 143 GO:0003251 100
## 144 GO:0003348 101
## 145 GO:0060956 102
## 146 GO:0061040 103
## 147 GO:1905305 104
## 148 GO:0021550 105
## 149 GO:0021744 106
## 150 GO:0035750 107
## 151 GO:0035177 108
## 152 GO:0002143 109
## 153 GO:0032447 21
## 154 GO:0098822 110
## 155 GO:0045989 41
## 156 GO:0006432 40
## 157 GO:0060047 111
## 158 GO:0010644 112
## 159 GO:0021934 113
## 160 GO:0021935 114
## 161 GO:1902816 115
## 162 GO:1902817 116
## 163 GO:1904527 117
## 164 GO:0030497 40
## 165 GO:0042761 40
## 166 GO:0007127 27
## 167 GO:1903697 118
## 168 GO:1904441 119
## 169 GO:1904442 120
## 170 GO:1990790 121
## 171 GO:1990792 122
## 172 GO:0070407 123
## 173 GO:0043576 124
## 174 GO:0035065 71
## 175 GO:0010766 1
## 176 GO:1902306 1
## 177 GO:2000650 1
## 178 GO:0043371 125
## 179 GO:2000515 126
## 180 GO:0038166 127
## 181 GO:0048867 58
## 182 GO:0031335 128
## 183 GO:0050666 129
## 184 GO:0016233 15
## 185 GO:0002041 130
## 186 GO:0002044 131
## 187 GO:0098661 1
## 188 GO:1903976 132
## 189 GO:1904464 133
## 190 GO:1904465 134
## 191 GO:0031573 27
## 192 GO:0046666 135
## 193 GO:2000758 71
## 194 GO:0060390 136
## 195 GO:0050650 40
## 196 GO:0086009 30
## 197 GO:0086011 30
## 198 GO:0086013 137
## 199 GO:0099622 138
## 200 GO:0003283 139
## 201 GO:0006024 40
## 202 GO:0022011 140
## 203 GO:0032292 141
## 204 GO:0000002 142
## 205 GO:0045822 143
## 206 GO:0003015 144
## 207 GO:0043353 145
## 208 GO:0048822 146
## 209 GO:0045622 147
dev.off()
## quartz_off_screen
## 2
# Following filtering of small clusters is based on the simplifyGO/ht_clusters source code for merging the small clusters together
# Filter clusters smaller than 3
cluster_sizes <- table(result$cluster) # table function makes a table with the size of each cluster (freqeuncy)
large_clusters <- as.numeric(names(cluster_sizes)[cluster_sizes >= 3]) # Identify clusters with frequenct/size less than 3
filtered_indices <- result$cluster %in% large_clusters # Filter out the indices belonging to small clusters
filtered_Mat <- Mat[filtered_indices, filtered_indices] # Create a new matrix with only the indicies from larger clusters
pdf(file = "../../output/WGCNA/GO_analysis/semantic_similarity/calc_up_mods_GOSEM_red_filtered.pdf", width = 7, height = 5)
simplifyGO(filtered_Mat, word_cloud_grob_param = list(max_width = 50), min_term = 1, exclude_words = c("regulation", "process"), order_by_size = TRUE)
## Cluster 68 terms by 'binary_cut'... 8 clusters, used 0.05846906 secs.
## 'magick' package is suggested to install to give better rasterization.
##
## Set `ht_opt$message = FALSE` to turn off this message.
## Perform keywords enrichment for 8 GO lists...
dev.off()
## quartz_off_screen
## 2
ID.vector <- geneInfo %>%
filter(moduleColor %in% c("black")) %>%
pull(gene_id)
length(ID.vector) #396
## [1] 396
##Get a list of GO Terms for the all 9012 genes
GO.terms <- geneInfo %>%
dplyr::select(gene_id, GOs) %>% rename(GOs = "GO.terms")
dim(GO.terms) #9012 2
## [1] 9012 2
##Format to have one goterm per row with gene ID repeated
split <- strsplit(as.character(GO.terms$GO.terms), ";")
split2 <- data.frame(v1 = rep.int(GO.terms$gene, sapply(split, length)), v2 = unlist(split))
colnames(split2) <- c("gene", "GO.terms")
GO.terms<-split2
##Construct list of genes with 1 for genes in module and 0 for genes not in the module
gene.vector=as.integer(ALL.vector %in% ID.vector)
names(gene.vector)<-ALL.vector#set names
#weight gene vector by bias for length of gene
pwf<-nullp(gene.vector, ID.vector, bias.data=LENGTH.vector)
#run goseq using Wallenius method for all categories of GO terms
GO.wall<-goseq(pwf, ID.vector, gene2cat=GO.terms, test.cats=c("GO:BP", "GO:MF", "GO:CC"), method="Wallenius", use_genes_without_cat=TRUE)
## Using manually entered categories.
## Calculating the p-values...
## 'select()' returned 1:1 mapping between keys and columns
GO <- GO.wall[order(GO.wall$over_represented_pvalue),]
colnames(GO)[1] <- "GOterm"
#Filtering for p < 0.05
GO_05_black <- GO %>%
dplyr::filter(over_represented_pvalue<0.05) %>%
dplyr::arrange(., ontology, over_represented_pvalue) %>% mutate(Module = "Black")
go_results_BP <- GO_05_black %>%
filter(ontology=="BP")%>%
filter(over_represented_pvalue != "NA") %>%
#filter(numInCat>10)%>%
arrange(., over_represented_pvalue)
dim(go_results_BP)
## [1] 395 8
Top10Plot <- go_results_BP %>%
top_n(10, wt=-over_represented_pvalue) %>%
mutate(hitsPerc=numDEInCat/numInCat) %>%
ggplot(aes(x=hitsPerc,
y=reorder(term, -hitsPerc),
colour=over_represented_pvalue,
size=numDEInCat)) +
geom_point() +
scale_colour_gradient(low = "red", high = "blue", name = "p value",guide=guide_colorbar(reverse=TRUE)) +
expand_limits(x=0) +
scale_y_discrete(labels = label_wrap(40)) +
labs(title = "Top 10 Enriched Biological Process GO Terms, Black Module", x="Proportion of Genes in Module per GO Term", y="GO term", colour="p value", size="Count") +
theme_bw() +
theme(axis.text.y = element_text(size = 8),
axis.title = element_text(size = 10),
plot.title = element_text(hjust = 0.5, size = 10))
Top10Plot
ggsave("../../output/WGCNA/GO_analysis/Top10_BP_by_mod/Black.pdf", Top10Plot, width = 5, height = 5, units = c("in"), dpi=300, limitsize=FALSE)
Collapsing with simplifyEnrichment package
library(simplifyEnrichment)
BP_terms <- go_results_BP$GOterm
Mat <- GO_similarity(BP_terms, ont = "BP", db = "org.Ce.eg.db")
pdf(file = "../../output/WGCNA/GO_analysis/semantic_similarity/calc_up_mods_GOSEM_black.pdf", width = 7, height = 5)
simplifyGO(Mat, word_cloud_grob_param = list(max_width = 50), min_term = 3, exclude_words = c("regulation", "process"), order_by_size = TRUE)
## Cluster 395 terms by 'binary_cut'... 281 clusters, used 3.360521 secs.
## 'magick' package is suggested to install to give better rasterization.
##
## Set `ht_opt$message = FALSE` to turn off this message.
## Perform keywords enrichment for 8 GO lists...
dev.off()
## quartz_off_screen
## 2
ID.vector <- geneInfo %>%
filter(moduleColor %in% c("pink")) %>%
pull(gene_id)
length(ID.vector) #220
## [1] 220
##Get a list of GO Terms for the all 9012 genes
GO.terms <- geneInfo %>%
dplyr::select(gene_id, GOs) %>% rename(GOs = "GO.terms")
dim(GO.terms) #9012 2
## [1] 9012 2
##Format to have one goterm per row with gene ID repeated
split <- strsplit(as.character(GO.terms$GO.terms), ";")
split2 <- data.frame(v1 = rep.int(GO.terms$gene, sapply(split, length)), v2 = unlist(split))
colnames(split2) <- c("gene", "GO.terms")
GO.terms<-split2
##Construct list of genes with 1 for genes in module and 0 for genes not in the module
gene.vector=as.integer(ALL.vector %in% ID.vector)
names(gene.vector)<-ALL.vector#set names
#weight gene vector by bias for length of gene
pwf<-nullp(gene.vector, ID.vector, bias.data=LENGTH.vector)
#run goseq using Wallenius method for all categories of GO terms
GO.wall<-goseq(pwf, ID.vector, gene2cat=GO.terms, test.cats=c("GO:BP", "GO:MF", "GO:CC"), method="Wallenius", use_genes_without_cat=TRUE)
## Using manually entered categories.
## Calculating the p-values...
## 'select()' returned 1:1 mapping between keys and columns
GO <- GO.wall[order(GO.wall$over_represented_pvalue),]
colnames(GO)[1] <- "GOterm"
#Filtering for p < 0.05
GO_05_pink <- GO %>%
dplyr::filter(over_represented_pvalue<0.05) %>%
dplyr::arrange(., ontology, over_represented_pvalue) %>% mutate(Module = "Pink")
go_results_BP <- GO_05_pink %>%
filter(ontology=="BP")%>%
filter(over_represented_pvalue != "NA") %>%
#filter(numInCat>10)%>%
arrange(., over_represented_pvalue)
dim(go_results_BP)
## [1] 234 8
Top10Plot <- go_results_BP %>%
top_n(10, wt=-over_represented_pvalue) %>%
mutate(hitsPerc=numDEInCat/numInCat) %>%
ggplot(aes(x=hitsPerc,
y=reorder(term, -hitsPerc),
colour=over_represented_pvalue,
size=numDEInCat)) +
geom_point() +
scale_colour_gradient(low = "red", high = "blue", name = "p value",guide=guide_colorbar(reverse=TRUE)) +
expand_limits(x=0) +
scale_y_discrete(labels = label_wrap(40)) +
labs(title = "Top 10 Enriched Biological Process GO Terms, Pink Module", x="Proportion of Genes in Module per GO Term", y="GO term", colour="p value", size="Count") +
theme_bw() +
theme(axis.text.y = element_text(size = 8),
axis.title = element_text(size = 10),
plot.title = element_text(hjust = 0.5, size = 10))
Top10Plot
ggsave("../../output/WGCNA/GO_analysis/Top10_BP_by_mod/Pink.pdf", Top10Plot, width = 5, height = 5, units = c("in"), dpi=300, limitsize=FALSE)
Collapsing with simplifyEnrichment package
library(simplifyEnrichment)
BP_terms <- go_results_BP$GOterm
Mat <- GO_similarity(BP_terms, ont = "BP", db = "org.Ce.eg.db")
pdf(file = "../../output/WGCNA/GO_analysis/semantic_similarity/calc_up_mods_GOSEM_pink.pdf", width = 7, height = 5)
simplifyGO(Mat, word_cloud_grob_param = list(max_width = 50), min_term = 3, exclude_words = c("regulation", "process"), order_by_size = TRUE)
## Cluster 234 terms by 'binary_cut'... 125 clusters, used 0.6260972 secs.
## 'magick' package is suggested to install to give better rasterization.
##
## Set `ht_opt$message = FALSE` to turn off this message.
## Perform keywords enrichment for 6 GO lists...
dev.off()
## quartz_off_screen
## 2
ID.vector <- geneInfo %>%
filter(moduleColor %in% c("salmon")) %>%
pull(gene_id)
length(ID.vector) #154
## [1] 154
##Get a list of GO Terms for the all 9012 genes
GO.terms <- geneInfo %>%
dplyr::select(gene_id, GOs) %>% rename(GOs = "GO.terms")
dim(GO.terms) #9012 2
## [1] 9012 2
##Format to have one goterm per row with gene ID repeated
split <- strsplit(as.character(GO.terms$GO.terms), ";")
split2 <- data.frame(v1 = rep.int(GO.terms$gene, sapply(split, length)), v2 = unlist(split))
colnames(split2) <- c("gene", "GO.terms")
GO.terms<-split2
##Construct list of genes with 1 for genes in module and 0 for genes not in the module
gene.vector=as.integer(ALL.vector %in% ID.vector)
names(gene.vector)<-ALL.vector#set names
#weight gene vector by bias for length of gene
pwf<-nullp(gene.vector, ID.vector, bias.data=LENGTH.vector)
#run goseq using Wallenius method for all categories of GO terms
GO.wall<-goseq(pwf, ID.vector, gene2cat=GO.terms, test.cats=c("GO:BP", "GO:MF", "GO:CC"), method="Wallenius", use_genes_without_cat=TRUE)
## Using manually entered categories.
## Calculating the p-values...
## 'select()' returned 1:1 mapping between keys and columns
GO <- GO.wall[order(GO.wall$over_represented_pvalue),]
colnames(GO)[1] <- "GOterm"
#Filtering for p < 0.05
GO_05_salmon <- GO %>%
dplyr::filter(over_represented_pvalue<0.05) %>%
dplyr::arrange(., ontology, over_represented_pvalue) %>% mutate(Module = "Salmon")
go_results_BP <- GO_05_salmon %>%
filter(ontology=="BP")%>%
filter(over_represented_pvalue != "NA") %>%
#filter(numInCat>10)%>%
arrange(., over_represented_pvalue)
dim(go_results_BP)
## [1] 198 8
Top10Plot <- go_results_BP %>%
top_n(10, wt=-over_represented_pvalue) %>%
mutate(hitsPerc=numDEInCat/numInCat) %>%
ggplot(aes(x=hitsPerc,
y=reorder(term, -hitsPerc),
colour=over_represented_pvalue,
size=numDEInCat)) +
geom_point() +
scale_colour_gradient(low = "red", high = "blue", name = "p value",guide=guide_colorbar(reverse=TRUE)) +
expand_limits(x=0) +
scale_y_discrete(labels = label_wrap(40)) +
labs(title = "Top 10 Enriched Biological Process GO Terms, Salmon Module", x="Proportion of Genes in Module per GO Term", y="GO term", colour="p value", size="Count") +
theme_bw() +
theme(axis.text.y = element_text(size = 8),
axis.title = element_text(size = 10),
plot.title = element_text(hjust = 0.5, size = 10))
Top10Plot
ggsave("../../output/WGCNA/GO_analysis/Top10_BP_by_mod/Salmon.pdf", Top10Plot, width = 5, height = 5, units = c("in"), dpi=300, limitsize=FALSE)
Collapsing with simplifyEnrichment package
library(simplifyEnrichment)
BP_terms <- go_results_BP$GOterm
Mat <- GO_similarity(BP_terms, ont = "BP", db = "org.Ce.eg.db")
pdf(file = "../../output/WGCNA/GO_analysis/semantic_similarity/calc_up_mods_GOSEM_salmon.pdf", width = 7, height = 5)
simplifyGO(Mat, word_cloud_grob_param = list(max_width = 50), min_term = 3, exclude_words = c("regulation", "process"), order_by_size = TRUE)
## Cluster 198 terms by 'binary_cut'... 79 clusters, used 0.9675319 secs.
## 'magick' package is suggested to install to give better rasterization.
##
## Set `ht_opt$message = FALSE` to turn off this message.
## Perform keywords enrichment for 10 GO lists...
dev.off()
## quartz_off_screen
## 2
ID.vector <- geneInfo %>%
filter(moduleColor %in% c("blue")) %>%
pull(gene_id)
length(ID.vector) #1989
## [1] 1989
##Get a list of GO Terms for the all 9012 genes
GO.terms <- geneInfo %>%
dplyr::select(gene_id, GOs) %>% rename(GOs = "GO.terms")
dim(GO.terms) #9012 2
## [1] 9012 2
##Format to have one goterm per row with gene ID repeated
split <- strsplit(as.character(GO.terms$GO.terms), ";")
split2 <- data.frame(v1 = rep.int(GO.terms$gene, sapply(split, length)), v2 = unlist(split))
colnames(split2) <- c("gene", "GO.terms")
GO.terms<-split2
##Construct list of genes with 1 for genes in module and 0 for genes not in the module
gene.vector=as.integer(ALL.vector %in% ID.vector)
names(gene.vector)<-ALL.vector#set names
#weight gene vector by bias for length of gene
pwf<-nullp(gene.vector, ID.vector, bias.data=LENGTH.vector)
## Warning in pcls(G): initial point very close to some inequality constraints
#run goseq using Wallenius method for all categories of GO terms
GO.wall<-goseq(pwf, ID.vector, gene2cat=GO.terms, test.cats=c("GO:BP", "GO:MF", "GO:CC"), method="Wallenius", use_genes_without_cat=TRUE)
## Using manually entered categories.
## Calculating the p-values...
## 'select()' returned 1:1 mapping between keys and columns
GO <- GO.wall[order(GO.wall$over_represented_pvalue),]
colnames(GO)[1] <- "GOterm"
#Filtering for p < 0.05
GO_05_blue <- GO %>%
dplyr::filter(over_represented_pvalue<0.05) %>%
dplyr::arrange(., ontology, over_represented_pvalue) %>% mutate(Module = "Blue")
go_results_BP <- GO_05_blue %>%
filter(ontology=="BP")%>%
filter(over_represented_pvalue != "NA") %>%
#filter(numInCat>10)%>%
arrange(., over_represented_pvalue)
dim(go_results_BP)
## [1] 1530 8
Top10Plot <- go_results_BP %>%
top_n(10, wt=-over_represented_pvalue) %>%
mutate(hitsPerc=numDEInCat/numInCat) %>%
ggplot(aes(x=hitsPerc,
y=reorder(term, -hitsPerc),
colour=over_represented_pvalue,
size=numDEInCat)) +
geom_point() +
scale_colour_gradient(low = "red", high = "blue", name = "p value",guide=guide_colorbar(reverse=TRUE)) +
expand_limits(x=0) +
scale_y_discrete(labels = label_wrap(40)) +
labs(title = "Top 10 Enriched Biological Process GO Terms, Blue Module", x="Proportion of Genes in Module per GO Term", y="GO term", colour="p value", size="Count") +
theme_bw() +
theme(axis.text.y = element_text(size = 8),
axis.title = element_text(size = 10),
plot.title = element_text(hjust = 0.5, size = 10))
Top10Plot
ggsave("../../output/WGCNA/GO_analysis/Top10_BP_by_mod/Blue.pdf", Top10Plot, width = 5, height = 5, units = c("in"), dpi=300, limitsize=FALSE)
Collapsing with simplifyEnrichment package
library(simplifyEnrichment)
BP_terms <- go_results_BP$GOterm
Mat <- GO_similarity(BP_terms, ont = "BP", db = "org.Ce.eg.db")
pdf(file = "../../output/WGCNA/GO_analysis/semantic_similarity/calc_up_mods_GOSEM_blue.pdf", width = 7, height = 5)
simplifyGO(Mat, word_cloud_grob_param = list(max_width = 50))
## Cluster 1530 terms by 'binary_cut'... 419 clusters, used 16.64123 secs.
## 'magick' package is suggested to install to give better rasterization.
##
## Set `ht_opt$message = FALSE` to turn off this message.
## Perform keywords enrichment for 7 GO lists...
dev.off()
## quartz_off_screen
## 2
GO_05_UP <- rbind(GO_05_brown,GO_05_red,GO_05_black,GO_05_pink,GO_05_salmon,GO_05_blue)
length(GO_05_UP$GOterm) #3810 enriched GO terms
## [1] 3810
length(unique(GO_05_UP$GOterm)) #3617 enriched GO terms
## [1] 3617
# Collapse GO terms that are duplicated accross modules - sort by GO term and p-value, then keep the row with the lowest p-value, collapsing the module names so you know which modules were represented
# Identify and collapse duplicates
GO_05_UP <- GO_05_UP %>%
arrange(GOterm, over_represented_pvalue) %>%
group_by(GOterm) %>% mutate(Module = paste(unique(Module), collapse = ";")) %>%
group_by(GOterm) %>%
slice_min(order_by = over_represented_pvalue, n = 1) %>%
ungroup()
length(GO_05_UP$GOterm) #3617 enriched GO terms
## [1] 3617
length(unique(GO_05_UP$GOterm)) #3617 enriched GO terms, all are unique
## [1] 3617
#Write file of results
write.csv(GO_05_UP, file = "../../output/WGCNA/GO_analysis/goseq_pattern_calcification_up_bymod.csv")
2842 genes are in the 4 modules significantly downregulated by calcification.
### Generate vector with names of all genes
ALL.vector <- c(geneInfo$gene_id)
### Generate length vector for all genes
LENGTH.vector <- as.integer(geneInfo$Length)
### Generate vector with names in just the module we are analyzing
ID.vector <- geneInfo%>%
filter(moduleColor %in% c("turquoise","magenta","lightcyan"))%>%
pull(gene_id)
length(ID.vector) #2842
## [1] 2842
##Get a list of GO Terms for the all 9012 genes
GO.terms <- geneInfo %>%
dplyr::select(gene_id, GOs) %>% rename(GOs = "GO.terms")
dim(GO.terms) #9012 2
## [1] 9012 2
##Format to have one goterm per row with gene ID repeated
split <- strsplit(as.character(GO.terms$GO.terms), ";")
split2 <- data.frame(v1 = rep.int(GO.terms$gene, sapply(split, length)), v2 = unlist(split))
colnames(split2) <- c("gene", "GO.terms")
GO.terms<-split2
##Construct list of genes with 1 for genes in module and 0 for genes not in the module
gene.vector=as.integer(ALL.vector %in% ID.vector)
names(gene.vector)<-ALL.vector#set names
#weight gene vector by bias for length of gene
pwf<-nullp(gene.vector, ID.vector, bias.data=LENGTH.vector)
### Run the GOSeq function by module color to test for GO term
enrichment. Due to high number of enriched GO terms, I am using a
p-value threshold of <0.05 and using
rrvgo package to
reduce redundancy in list of GO terms.
#run goseq using Wallenius method for all categories of GO terms
GO.wall<-goseq(pwf, ID.vector, gene2cat=GO.terms, test.cats=c("GO:BP", "GO:MF", "GO:CC"), method="Wallenius", use_genes_without_cat=TRUE)
## Using manually entered categories.
## Calculating the p-values...
## 'select()' returned 1:1 mapping between keys and columns
GO <- GO.wall[order(GO.wall$over_represented_pvalue),]
colnames(GO)[1] <- "GOterm"
#Filtering for p < 0.05
GO_05 <- GO %>%
dplyr::filter(over_represented_pvalue<0.05) %>%
dplyr::arrange(., ontology, over_represented_pvalue)
#Write file of results
write.csv(GO_05, file = "../../output/WGCNA/GO_analysis/goseq_pattern_calcification_down.csv")
GO_05 <-read.csv("../../output/WGCNA/GO_analysis/goseq_pattern_calcification_down.csv")
go_results_BP <- GO_05 %>%
filter(ontology=="BP") %>%
filter(over_represented_pvalue != "NA") %>%
#filter(numInCat > 10) %>%
arrange(., over_represented_pvalue)
dim(go_results_BP)
## [1] 298 8
head(go_results_BP)
## X GOterm over_represented_pvalue under_represented_pvalue numDEInCat
## 1 1 GO:0002181 8.295294e-15 1 67
## 2 2 GO:0006614 2.463649e-13 1 48
## 3 3 GO:0006412 3.096109e-13 1 123
## 4 4 GO:0043043 1.598313e-12 1 126
## 5 5 GO:0006613 2.273026e-12 1 49
## 6 6 GO:0006518 7.543937e-12 1 146
## numInCat term ontology
## 1 90 cytoplasmic translation BP
## 2 59 SRP-dependent cotranslational protein targeting to membrane BP
## 3 219 translation BP
## 4 231 peptide biosynthetic process BP
## 5 63 cotranslational protein targeting to membrane BP
## 6 285 peptide metabolic process BP
Collapsing with simplifyEnrichment package
library(simplifyEnrichment)
BP_terms <- go_results_BP$GOterm
Mat <- GO_similarity(BP_terms, ont = "BP", db = "org.Ce.eg.db")
pdf(file = "../../output/WGCNA/GO_analysis/semantic_similarity/calc_down_mods_GOSEM.pdf", width = 7, height = 5)
simplifyGO(Mat, word_cloud_grob_param = list(max_width = 50), max_words=20)
## Cluster 298 terms by 'binary_cut'... 87 clusters, used 0.6110082 secs.
## 'magick' package is suggested to install to give better rasterization.
##
## Set `ht_opt$message = FALSE` to turn off this message.
## Perform keywords enrichment for 4 GO lists...
dev.off()
## quartz_off_screen
## 2
CalcDownMods_GO_P05 <- go_results_BP %>%
mutate(term = fct_reorder(term, over_represented_pvalue, .desc = TRUE)) %>%
ggplot(aes(x=term, y=over_represented_pvalue) ) +
geom_segment(aes(x=term ,xend=term, y=0, yend=over_represented_pvalue), color="grey") +
geom_point(size=1, color="#69b3a2") +
coord_flip() +
theme(
panel.grid.minor.y = element_blank(),
panel.grid.major.y = element_blank(),
legend.position="none"
) +
xlab("") +
ylab("over_represented_pvalue") +
ggtitle("Biological Process GO Terms Enriched in Modules Downregulated in Calcification") + #add a main title
theme(plot.title = element_text(face = 'bold',
size = 3,
hjust = 0)) +
theme_bw() + #Set background color
theme(panel.border = element_blank(), # Set border
panel.grid.major = element_blank(), #Set major gridlines
panel.grid.minor = element_blank(), #Set minor gridlines
axis.line = element_line(colour = "black"), #Set axes color
axis.text.y = element_text(size = 2), #set y axis labels size
plot.background=element_blank(),#Set the plot background
legend.position="none")
CalcDownMods_GO_P05
ggsave("../../output/WGCNA/GO_analysis/CalcDownMods_GO_P05.pdf", CalcDownMods_GO_P05, width = 12, height = 36, units = c("in"), dpi=300, limitsize=FALSE)
library(rrvgo)
#Reduce/collapse GO term set with the rrvgo package
simMatrix <- calculateSimMatrix(go_results_BP$GOterm,
orgdb="org.Ce.eg.db", #c. elegans database
ont="BP",
method="Rel")
## preparing gene to GO mapping data...
## preparing IC data...
#calculate similarity
scores <- setNames(-log(go_results_BP$over_represented_pvalue), go_results_BP$GOterm)
reducedTerms <- reduceSimMatrix(simMatrix,
scores,
threshold=0.7,
orgdb="org.Ce.eg.db")
dim(reducedTerms)
## [1] 226 10
#keep only the goterms from the reduced list
go_results_BP_reduced <- go_results_BP %>%
filter(GOterm %in% reducedTerms$go)
#add in parent terms to list of go terms
go_results_BP_reduced$ParentTerm <- reducedTerms$parentTerm[match(go_results_BP_reduced$GOterm, reducedTerms$go)]
length(unique(go_results_BP_reduced$ParentTerm))
## [1] 38
The reduced list of terms is 226 terms that falls under 38 parent terms.
write.csv(go_results_BP_reduced, "../../output/WGCNA/GO_analysis/goseq_pattern_calcification_down_filtered.csv")
#plot significantly enriched GO terms by Slim Category faceted by slim term
GO.plot_calcification_down <- ggplot(go_results_BP_reduced, aes(x = ontology, y = term)) +
geom_point(aes(size=over_represented_pvalue)) +
scale_size(name="Over rep. p-value", trans="reverse", range=c(1,3))+
facet_grid(ParentTerm ~ ., scales = "free", labeller = label_wrap_gen(width = 5, multi_line = TRUE))+
theme_bw() + theme(panel.border = element_blank(), panel.grid.major = element_blank(),
panel.grid.minor = element_blank(), axis.line = element_line(colour = "black"),
strip.text.y = element_text(angle=0, size = 10),
strip.text.x = element_text(size = 20),
axis.text = element_text(size = 8),
axis.title.x = element_blank(),
axis.title.y = element_blank())
GO.plot_calcification_down
ggsave(filename="../../output/WGCNA/GO_analysis/GO.plot_calcification_down.pdf", plot=GO.plot_calcification_down, dpi=300, height=100, units="in", limitsize=FALSE)
## Saving 7 x 100 in image
CalcDownMods_GO_P05_reduced <- go_results_BP_reduced %>%
mutate(term = fct_reorder(term, over_represented_pvalue, .desc = TRUE)) %>%
ggplot(aes(x=term, y=over_represented_pvalue) ) +
geom_segment(aes(x=term ,xend=term, y=0, yend=over_represented_pvalue), color="grey") +
geom_point(size=1, color="#69b3a2") +
coord_flip() +
theme(
panel.grid.minor.y = element_blank(),
panel.grid.major.y = element_blank(),
legend.position="none"
) +
xlab("") +
ylab("over_represented_pvalue") +
ggtitle("Biological Process GO Terms Enriched in Modules Downregulated in Calcification") + #add a main title
theme(plot.title = element_text(face = 'bold',
size = 3,
hjust = 0)) +
theme_bw() + #Set background color
theme(panel.border = element_blank(), # Set border
panel.grid.major = element_blank(), #Set major gridlines
panel.grid.minor = element_blank(), #Set minor gridlines
axis.line = element_line(colour = "black"), #Set axes color
axis.text.y = element_text(size = 2), #set y axis labels size
plot.background=element_blank(),#Set the plot background
legend.position="none")
CalcDownMods_GO_P05_reduced
ggsave("../../output/WGCNA/GO_analysis/CalcDownMods_GO_P05_reduced.pdf", CalcDownMods_GO_P05_reduced, width = 12, height = 36, units = c("in"), dpi=300, limitsize=FALSE)
### Generate vector with names of all genes
ALL.vector <- c(geneInfo$gene_id)
### Generate length vector for all genes
LENGTH.vector <- as.integer(geneInfo$Length)
ID.vector <- geneInfo %>%
filter(moduleColor %in% c("turquoise")) %>%
pull(gene_id)
length(ID.vector) #2558
## [1] 2558
##Get a list of GO Terms for the all 9012 genes
GO.terms <- geneInfo %>%
dplyr::select(gene_id, GOs) %>% rename(GOs = "GO.terms")
dim(GO.terms) #9012 2
## [1] 9012 2
##Format to have one goterm per row with gene ID repeated
split <- strsplit(as.character(GO.terms$GO.terms), ";")
split2 <- data.frame(v1 = rep.int(GO.terms$gene, sapply(split, length)), v2 = unlist(split))
colnames(split2) <- c("gene", "GO.terms")
GO.terms<-split2
##Construct list of genes with 1 for genes in module and 0 for genes not in the module
gene.vector=as.integer(ALL.vector %in% ID.vector)
names(gene.vector)<-ALL.vector #set names
#weight gene vector by bias for length of gene
pwf<-nullp(gene.vector, ID.vector, bias.data=LENGTH.vector)
#run goseq using Wallenius method for all categories of GO terms
GO.wall<-goseq(pwf, ID.vector, gene2cat=GO.terms, test.cats=c("GO:BP", "GO:MF", "GO:CC"), method="Wallenius", use_genes_without_cat=TRUE)
## Using manually entered categories.
## Calculating the p-values...
## 'select()' returned 1:1 mapping between keys and columns
GO <- GO.wall[order(GO.wall$over_represented_pvalue),]
colnames(GO)[1] <- "GOterm"
#Filtering for p < 0.05
GO_05_turquoise <- GO %>%
dplyr::filter(over_represented_pvalue<0.05) %>%
dplyr::arrange(., ontology, over_represented_pvalue) %>% mutate(Module = "Turquoise")
go_results_BP <- GO_05_turquoise %>%
filter(ontology=="BP")%>%
filter(over_represented_pvalue != "NA") %>%
#filter(numInCat>10)%>%
arrange(., over_represented_pvalue)
dim(go_results_BP)
## [1] 382 8
Top10Plot <- go_results_BP %>%
top_n(10, wt=-over_represented_pvalue) %>%
mutate(hitsPerc=numDEInCat/numInCat) %>%
ggplot(aes(x=hitsPerc,
y=reorder(term, -hitsPerc),
colour=over_represented_pvalue,
size=numDEInCat)) +
geom_point() +
scale_colour_gradient(low = "red", high = "blue", name = "p value",guide=guide_colorbar(reverse=TRUE)) +
expand_limits(x=0) +
scale_y_discrete(labels = label_wrap(40)) +
labs(title = "Top 10 Enriched Biological Process GO Terms, Turquoise Module", x="Proportion of Genes in Module per GO Term", y="GO term", colour="p value", size="Count") +
theme_bw() +
theme(axis.text.y = element_text(size = 8),
axis.title = element_text(size = 10),
plot.title = element_text(hjust = 0.5, size = 10))
Top10Plot
ggsave("../../output/WGCNA/GO_analysis/Top10_BP_by_mod/Turquoise.pdf", Top10Plot, width = 5, height = 5, units = c("in"), dpi=300, limitsize=FALSE)
Collapsing with simplifyEnrichment package
library(simplifyEnrichment)
BP_terms <- go_results_BP$GOterm
Mat <- GO_similarity(BP_terms, ont = "BP", db = "org.Ce.eg.db")
pdf(file = "../../output/WGCNA/GO_analysis/semantic_similarity/calc_down_mods_GOSEM_turquoise.pdf", width = 7, height = 5)
simplifyGO(Mat, word_cloud_grob_param = list(max_width = 50), min_term = 3, exclude_words = c("regulation", "process"), order_by_size = TRUE)
## Cluster 382 terms by 'binary_cut'... 115 clusters, used 1.127246 secs.
## 'magick' package is suggested to install to give better rasterization.
##
## Set `ht_opt$message = FALSE` to turn off this message.
## Perform keywords enrichment for 8 GO lists...
dev.off()
## quartz_off_screen
## 2
ID.vector <- geneInfo %>%
filter(moduleColor %in% c("magenta")) %>%
pull(gene_id)
length(ID.vector) #219
## [1] 219
##Get a list of GO Terms for the all 9012 genes
GO.terms <- geneInfo %>%
dplyr::select(gene_id, GOs) %>% rename(GOs = "GO.terms")
dim(GO.terms) #9012 2
## [1] 9012 2
##Format to have one goterm per row with gene ID repeated
split <- strsplit(as.character(GO.terms$GO.terms), ";")
split2 <- data.frame(v1 = rep.int(GO.terms$gene, sapply(split, length)), v2 = unlist(split))
colnames(split2) <- c("gene", "GO.terms")
GO.terms<-split2
##Construct list of genes with 1 for genes in module and 0 for genes not in the module
gene.vector=as.integer(ALL.vector %in% ID.vector)
names(gene.vector)<-ALL.vector#set names
#weight gene vector by bias for length of gene
pwf<-nullp(gene.vector, ID.vector, bias.data=LENGTH.vector)
## Warning in pcls(G): initial point very close to some inequality constraints
#run goseq using Wallenius method for all categories of GO terms
GO.wall<-goseq(pwf, ID.vector, gene2cat=GO.terms, test.cats=c("GO:BP", "GO:MF", "GO:CC"), method="Wallenius", use_genes_without_cat=TRUE)
## Using manually entered categories.
## Calculating the p-values...
## 'select()' returned 1:1 mapping between keys and columns
GO <- GO.wall[order(GO.wall$over_represented_pvalue),]
colnames(GO)[1] <- "GOterm"
#Filtering for p < 0.05
GO_05_magenta <- GO %>%
dplyr::filter(over_represented_pvalue<0.05) %>%
dplyr::arrange(., ontology, over_represented_pvalue) %>% mutate(Module = "Magenta")
go_results_BP <- GO_05_magenta %>%
filter(ontology=="BP")%>%
filter(over_represented_pvalue != "NA") %>%
#filter(numInCat>10)%>%
arrange(., over_represented_pvalue)
dim(go_results_BP)
## [1] 210 8
Top10Plot <- go_results_BP %>%
top_n(10, wt=-over_represented_pvalue) %>%
mutate(hitsPerc=numDEInCat/numInCat) %>%
ggplot(aes(x=hitsPerc,
y=reorder(term, -hitsPerc),
colour=over_represented_pvalue,
size=numDEInCat)) +
geom_point() +
scale_colour_gradient(low = "red", high = "blue", name = "p value",guide=guide_colorbar(reverse=TRUE)) +
expand_limits(x=0) +
scale_y_discrete(labels = label_wrap(40)) +
labs(title = "Top 10 Enriched Biological Process GO Terms, Magenta Module", x="Proportion of Genes in Module per GO Term", y="GO term", colour="p value", size="Count") +
theme_bw() +
theme(axis.text.y = element_text(size = 8),
axis.title = element_text(size = 10),
plot.title = element_text(hjust = 0.5, size = 10))
Top10Plot
ggsave("../../output/WGCNA/GO_analysis/Top10_BP_by_mod/Magenta.pdf", Top10Plot, width = 5, height = 5, units = c("in"), dpi=300, limitsize=FALSE)
Collapsing with simplifyEnrichment package
library(simplifyEnrichment)
BP_terms <- go_results_BP$GOterm
Mat <- GO_similarity(BP_terms, ont = "BP", db = "org.Ce.eg.db")
pdf(file = "../../output/WGCNA/GO_analysis/semantic_similarity/calc_down_mods_GOSEM_magenta.pdf", width = 7, height = 5)
result <- simplifyGO(Mat, word_cloud_grob_param = list(max_width = 50), min_term = 3, exclude_words = c("regulation", "process"), order_by_size = TRUE)
## Cluster 210 terms by 'binary_cut'... 147 clusters, used 0.8285391 secs.
## 'magick' package is suggested to install to give better rasterization.
##
## Set `ht_opt$message = FALSE` to turn off this message.
## Perform keywords enrichment for 7 GO lists...
result
## id cluster
## 1 GO:0019722 1
## 2 GO:0007267 1
## 3 GO:0033292 2
## 4 GO:0070296 3
## 5 GO:0017158 4
## 6 GO:0008584 5
## 7 GO:0046546 5
## 8 GO:0017157 4
## 9 GO:0014733 6
## 10 GO:0014883 7
## 11 GO:0086015 8
## 12 GO:0086018 9
## 13 GO:0086070 10
## 14 GO:0007268 1
## 15 GO:0098916 1
## 16 GO:0002027 11
## 17 GO:0099537 1
## 18 GO:0099536 1
## 19 GO:0071286 12
## 20 GO:1903305 4
## 21 GO:2000833 13
## 22 GO:0019932 1
## 23 GO:0045838 14
## 24 GO:0086014 15
## 25 GO:0086019 16
## 26 GO:0086026 17
## 27 GO:0086066 18
## 28 GO:0007436 19
## 29 GO:0086091 20
## 30 GO:0098901 21
## 31 GO:1903530 4
## 32 GO:0006376 22
## 33 GO:0051560 23
## 34 GO:0099623 24
## 35 GO:1901018 4
## 36 GO:0030001 4
## 37 GO:0006816 4
## 38 GO:0055074 23
## 39 GO:0086036 25
## 40 GO:0032471 26
## 41 GO:0046883 4
## 42 GO:1905954 4
## 43 GO:0090649 27
## 44 GO:0090650 28
## 45 GO:0086004 29
## 46 GO:1903115 30
## 47 GO:2000831 31
## 48 GO:0051561 23
## 49 GO:0051046 4
## 50 GO:0035260 32
## 51 GO:2000834 33
## 52 GO:2000836 34
## 53 GO:0033563 5
## 54 GO:0021858 35
## 55 GO:0048581 5
## 56 GO:0002168 36
## 57 GO:0060561 37
## 58 GO:1903553 38
## 59 GO:0010960 23
## 60 GO:0050906 5
## 61 GO:0045955 39
## 62 GO:0097711 40
## 63 GO:0010817 14
## 64 GO:0001915 41
## 65 GO:0010615 42
## 66 GO:0014744 43
## 67 GO:1903244 44
## 68 GO:1903799 22
## 69 GO:0014059 4
## 70 GO:0010862 1
## 71 GO:0002381 45
## 72 GO:1901381 4
## 73 GO:1903859 5
## 74 GO:1903861 5
## 75 GO:0008016 46
## 76 GO:0048791 4
## 77 GO:0032026 47
## 78 GO:1903532 4
## 79 GO:0086065 48
## 80 GO:0008406 5
## 81 GO:0045137 5
## 82 GO:0044060 49
## 83 GO:0000389 22
## 84 GO:0061067 5
## 85 GO:0032370 4
## 86 GO:0051412 50
## 87 GO:0019228 14
## 88 GO:0060306 51
## 89 GO:0003299 52
## 90 GO:0014887 53
## 91 GO:0014898 54
## 92 GO:0007619 55
## 93 GO:0006874 23
## 94 GO:0010543 56
## 95 GO:0009584 57
## 96 GO:0043972 58
## 97 GO:0042713 5
## 98 GO:0043084 59
## 99 GO:0060011 60
## 100 GO:0051047 4
## 101 GO:0043379 61
## 102 GO:0090713 62
## 103 GO:0090715 63
## 104 GO:1902349 64
## 105 GO:1902350 65
## 106 GO:1903939 66
## 107 GO:1904515 67
## 108 GO:1903306 4
## 109 GO:0019082 68
## 110 GO:0032804 69
## 111 GO:0032898 70
## 112 GO:0032902 71
## 113 GO:0032911 72
## 114 GO:0071635 73
## 115 GO:0090472 74
## 116 GO:0006812 4
## 117 GO:0002159 75
## 118 GO:0018894 76
## 119 GO:0030845 77
## 120 GO:0032425 78
## 121 GO:0042488 79
## 122 GO:0050930 80
## 123 GO:1990911 81
## 124 GO:2000705 82
## 125 GO:2000707 83
## 126 GO:0018401 84
## 127 GO:0019511 84
## 128 GO:1905289 85
## 129 GO:1905290 86
## 130 GO:0016331 5
## 131 GO:0036124 84
## 132 GO:0031446 87
## 133 GO:0031448 88
## 134 GO:0032470 89
## 135 GO:0051659 90
## 136 GO:0090076 91
## 137 GO:0098909 92
## 138 GO:1901896 93
## 139 GO:1903515 94
## 140 GO:1990036 95
## 141 GO:0007043 22
## 142 GO:0032274 96
## 143 GO:0032275 97
## 144 GO:0006813 4
## 145 GO:0045822 98
## 146 GO:0032469 23
## 147 GO:0050853 99
## 148 GO:0050962 100
## 149 GO:0051567 84
## 150 GO:0002710 101
## 151 GO:0051050 4
## 152 GO:0007435 102
## 153 GO:0030217 103
## 154 GO:1903779 104
## 155 GO:0034105 105
## 156 GO:0086002 106
## 157 GO:0034764 4
## 158 GO:0010650 107
## 159 GO:0034112 108
## 160 GO:0036309 109
## 161 GO:0036371 110
## 162 GO:0086046 111
## 163 GO:0098904 112
## 164 GO:0098907 113
## 165 GO:0098910 114
## 166 GO:1900827 115
## 167 GO:0042492 116
## 168 GO:0046629 117
## 169 GO:1990134 118
## 170 GO:0034110 119
## 171 GO:0035561 120
## 172 GO:0014888 121
## 173 GO:0032414 4
## 174 GO:0051385 122
## 175 GO:0001803 123
## 176 GO:0001805 124
## 177 GO:0001810 125
## 178 GO:0001812 126
## 179 GO:0002343 127
## 180 GO:0002344 128
## 181 GO:0002721 129
## 182 GO:0002883 130
## 183 GO:0002885 131
## 184 GO:0046960 132
## 185 GO:0061337 133
## 186 GO:0060047 134
## 187 GO:0002709 135
## 188 GO:0046958 136
## 189 GO:0040018 5
## 190 GO:0048312 90
## 191 GO:0060393 137
## 192 GO:0010882 138
## 193 GO:0046661 5
## 194 GO:0001666 57
## 195 GO:1901021 4
## 196 GO:0086003 139
## 197 GO:0061647 84
## 198 GO:0006937 5
## 199 GO:0086010 14
## 200 GO:0043267 4
## 201 GO:0032803 140
## 202 GO:0045478 141
## 203 GO:0006407 142
## 204 GO:0050848 1
## 205 GO:0001911 143
## 206 GO:0031342 144
## 207 GO:0046887 4
## 208 GO:0070884 145
## 209 GO:0106056 146
## 210 GO:0003015 147
dev.off()
## quartz_off_screen
## 2
# Following filtering of small clusters is based on the simplifyGO/ht_clusters source code for merging the small clusters together
# Filter clusters smaller than 3
cluster_sizes <- table(result$cluster) # table function makes a table with the size of each cluster (freqeuncy)
large_clusters <- as.numeric(names(cluster_sizes)[cluster_sizes >= 3]) # Identify clusters with frequenct/size less than 3
filtered_indices <- result$cluster %in% large_clusters # Filter out the indices belonging to small clusters
filtered_Mat <- Mat[filtered_indices, filtered_indices] # Create a new matrix with only the indicies from larger clusters
pdf(file = "../../output/WGCNA/GO_analysis/semantic_similarity/calc_down_mods_GOSEM_magenta_filtered.pdf", width = 7, height = 5)
simplifyGO(filtered_Mat, word_cloud_grob_param = list(max_width = 50), min_term = 1, exclude_words = c("regulation", "process"), order_by_size = TRUE)
## Cluster 68 terms by 'binary_cut'... 9 clusters, used 0.04797482 secs.
## 'magick' package is suggested to install to give better rasterization.
##
## Set `ht_opt$message = FALSE` to turn off this message.
## Perform keywords enrichment for 9 GO lists...
dev.off()
## quartz_off_screen
## 2
ID.vector <- geneInfo %>%
filter(moduleColor %in% c("lightcyan")) %>%
pull(gene_id)
length(ID.vector) #65
## [1] 65
##Get a list of GO Terms for the all 9012 genes
GO.terms <- geneInfo %>%
dplyr::select(gene_id, GOs) %>% rename(GOs = "GO.terms")
dim(GO.terms) #9012 2
## [1] 9012 2
##Format to have one goterm per row with gene ID repeated
split <- strsplit(as.character(GO.terms$GO.terms), ";")
split2 <- data.frame(v1 = rep.int(GO.terms$gene, sapply(split, length)), v2 = unlist(split))
colnames(split2) <- c("gene", "GO.terms")
GO.terms<-split2
##Construct list of genes with 1 for genes in module and 0 for genes not in the module
gene.vector=as.integer(ALL.vector %in% ID.vector)
names(gene.vector)<-ALL.vector#set names
#weight gene vector by bias for length of gene
pwf<-nullp(gene.vector, ID.vector, bias.data=LENGTH.vector)
## Warning in pcls(G): initial point very close to some inequality constraints
#run goseq using Wallenius method for all categories of GO terms
GO.wall<-goseq(pwf, ID.vector, gene2cat=GO.terms, test.cats=c("GO:BP", "GO:MF", "GO:CC"), method="Wallenius", use_genes_without_cat=TRUE)
## Using manually entered categories.
## Calculating the p-values...
## 'select()' returned 1:1 mapping between keys and columns
GO <- GO.wall[order(GO.wall$over_represented_pvalue),]
colnames(GO)[1] <- "GOterm"
#Filtering for p < 0.05
GO_05_lightcyan <- GO %>%
dplyr::filter(over_represented_pvalue<0.05) %>%
dplyr::arrange(., ontology, over_represented_pvalue) %>% mutate(Module = "Lightcyan")
go_results_BP <- GO_05_lightcyan %>%
filter(ontology=="BP")%>%
filter(over_represented_pvalue != "NA") %>%
#filter(numInCat>10)%>%
arrange(., over_represented_pvalue)
dim(go_results_BP)
## [1] 129 8
Top10Plot <- go_results_BP %>%
top_n(10, wt=-over_represented_pvalue) %>%
mutate(hitsPerc=numDEInCat/numInCat) %>%
ggplot(aes(x=hitsPerc,
y=reorder(term, -hitsPerc),
colour=over_represented_pvalue,
size=numDEInCat)) +
geom_point() +
scale_colour_gradient(low = "red", high = "blue", name = "p value",guide=guide_colorbar(reverse=TRUE)) +
expand_limits(x=0) +
scale_y_discrete(labels = label_wrap(40)) +
labs(title = "Top 10 Enriched Biological Process GO Terms, Lightcyan Module", x="Proportion of Genes in Module per GO Term", y="GO term", colour="p value", size="Count") +
theme_bw() +
theme(axis.text.y = element_text(size = 8),
axis.title = element_text(size = 10),
plot.title = element_text(hjust = 0.5, size = 10))
Top10Plot
ggsave("../../output/WGCNA/GO_analysis/Top10_BP_by_mod/Lightcyan.pdf", Top10Plot, width = 5, height = 5, units = c("in"), dpi=300, limitsize=FALSE)
Collapsing with simplifyEnrichment package
library(simplifyEnrichment)
BP_terms <- go_results_BP$GOterm
Mat <- GO_similarity(BP_terms, ont = "BP", db = "org.Ce.eg.db")
pdf(file = "../../output/WGCNA/GO_analysis/semantic_similarity/calc_down_mods_GOSEM_lightcyan.pdf", width = 7, height = 5)
simplifyGO(Mat, word_cloud_grob_param = list(max_width = 50), min_term = 3, exclude_words = c("regulation", "process"), order_by_size = TRUE)
## Cluster 129 terms by 'binary_cut'... 82 clusters, used 0.3009071 secs.
## 'magick' package is suggested to install to give better rasterization.
##
## Set `ht_opt$message = FALSE` to turn off this message.
## Perform keywords enrichment for 8 GO lists...
dev.off()
## quartz_off_screen
## 2
GO_05_DOWN <- rbind(GO_05_turquoise,GO_05_magenta,GO_05_lightcyan)
length(GO_05_DOWN$GOterm) #1072 enriched GO terms
## [1] 1072
length(unique(GO_05_DOWN$GOterm)) #1070 enriched GO terms
## [1] 1070
# Collapse GO terms that are duplicated accross modules - sort by GO term and p-value, then keep the row with the lowest p-value, collapsing the module names so you know which modules were represented
# Identify and collapse duplicates
GO_05_DOWN <- GO_05_DOWN %>%
arrange(GOterm, over_represented_pvalue) %>%
group_by(GOterm) %>% mutate(Module = paste(unique(Module), collapse = ";")) %>%
group_by(GOterm) %>%
slice_min(order_by = over_represented_pvalue, n = 1) %>%
ungroup()
#Write file of results
write.csv(GO_05_DOWN, file = "../../output/WGCNA/GO_analysis/goseq_pattern_calcification_down_bymod.csv")
# GO_05_up <- read.csv("../../output/WGCNA/GO_analysis/goseq_pattern_calcification.csv") %>% dplyr::select(-"X")
# GO_05_down <- read.csv("../../output/WGCNA/GO_analysis/goseq_pattern_calcification_down.csv") %>% dplyr::select(-"X")
GO_05_up <- read.csv("../../output/WGCNA/GO_analysis/goseq_pattern_calcification_up_bymod.csv") %>% dplyr::select(-"X")
GO_05_down <- read.csv("../../output/WGCNA/GO_analysis/goseq_pattern_calcification_down_bymod.csv") %>% dplyr::select(-"X")
go_results_BP_up <- GO_05_up %>%
filter(ontology=="BP")%>%
filter(over_represented_pvalue != "NA") %>%
#filter(numInCat>10)%>%
arrange(., over_represented_pvalue)
dim(go_results_BP_up)
## [1] 2647 8
go_results_BP_down <- GO_05_down %>%
filter(ontology=="BP")%>%
filter(over_represented_pvalue != "NA") %>%
#filter(numInCat>10)%>%
arrange(., over_represented_pvalue)
dim(go_results_BP_down)
## [1] 721 8
all_enriched_terms_up_down <- c(go_results_BP_up$GOterm,go_results_BP_down$GOterm)
length(all_enriched_terms_up_down)
## [1] 3368
length(unique(all_enriched_terms_up_down))
## [1] 3217
By reducing the lists together, we can ensure that each GO term is given the same parent term whether the term was in the Up or Down modules.
library(rrvgo)
#Reduce/collapse GO term set with the rrvgo package
simMatrix <- calculateSimMatrix(unique(all_enriched_terms_up_down),
orgdb="org.Ce.eg.db", #c. elegans database
ont="BP",
method="Rel")
## preparing gene to GO mapping data...
## preparing IC data...
#calculate similarity
reducedTerms <- reduceSimMatrix(simMatrix,
scores = "size",
threshold=0.7,
orgdb="org.Ce.eg.db")
## No scores provided. Falling back to term's GO size
dim(reducedTerms)
## [1] 2004 10
#keep only the goterms from the reduced list
go_results_BP_up_reduced <- go_results_BP_up %>%
filter(GOterm %in% reducedTerms$go)
#add in parent terms to list of go terms
go_results_BP_up_reduced$ParentTerm <- reducedTerms$parentTerm[match(go_results_BP_up_reduced$GOterm, reducedTerms$go)]
length(unique(go_results_BP_up_reduced$ParentTerm))
## [1] 148
#keep only the goterms from the reduced list
go_results_BP_down_reduced <- go_results_BP_down %>%
filter(GOterm %in% reducedTerms$go)
#add in parent terms to list of go terms
go_results_BP_down_reduced$ParentTerm <- reducedTerms$parentTerm[match(go_results_BP_down_reduced$GOterm, reducedTerms$go)]
length(unique(go_results_BP_down_reduced$ParentTerm))
## [1] 85
#write.csv(go_results_BP_up_reduced, "../../output/WGCNA/GO_analysis/goseq_pattern_calcification_filtered.csv")
#write.csv(go_results_BP_down_reduced, "../../output/WGCNA/GO_analysis/goseq_pattern_calcification_down_filtered.csv")
The following list is a list of BP-ontology (“Biological Process”) GO terms that were significantly enriched (P < 0.05) within the modules that were upregulated for calcification. The list has been further reduced by using the package rrvgo.
#cal_up_terms <- read.csv("../../output/WGCNA/GO_analysis/goseq_pattern_calcification_filtered.csv")
cal_up_terms <- go_results_BP_up_reduced
cal_up_terms <- cal_up_terms %>% mutate(Factor = "Up")
head(cal_up_terms)
## GOterm over_represented_pvalue under_represented_pvalue numDEInCat
## 1 GO:0050794 7.575558e-23 1 823
## 2 GO:0065007 9.035173e-22 1 912
## 3 GO:0050789 1.200453e-20 1 859
## 4 GO:0048522 2.736399e-18 1 531
## 5 GO:0048518 2.733516e-17 1 574
## 6 GO:0009987 1.373221e-16 1 1074
## numInCat term ontology Module
## 1 2785 regulation of cellular process BP Blue
## 2 3184 biological regulation BP Blue
## 3 2981 regulation of biological process BP Blue
## 4 1705 positive regulation of cellular process BP Blue
## 5 1888 positive regulation of biological process BP Blue
## 6 4026 cellular process BP Blue
## ParentTerm Factor
## 1 regulation of cellular process Up
## 2 regulation of cellular process Up
## 3 regulation of cellular process Up
## 4 positive regulation of transcription by RNA polymerase II Up
## 5 positive regulation of transcription by RNA polymerase II Up
## 6 cellular process Up
The following list is a list of BP-ontology (“Biological Process”) GO terms that were significantly enriched (P < 0.05) within the modules that were downregulated for calcification. The list has been further reduced by using the package rrvgo.
#cal_down_terms <- read.csv("../../output/WGCNA/GO_analysis/goseq_pattern_calcification_down_filtered.csv")
cal_down_terms <- go_results_BP_down_reduced
cal_down_terms <- cal_down_terms %>% mutate(Factor = "Down")
head(cal_down_terms)
## GOterm over_represented_pvalue under_represented_pvalue numDEInCat
## 1 GO:0002181 1.561228e-16 1 67
## 2 GO:0006412 2.328139e-16 1 123
## 3 GO:0043043 3.132084e-15 1 125
## 4 GO:0006518 5.575430e-15 1 145
## 5 GO:0006614 1.285770e-14 1 48
## 6 GO:0043603 2.192130e-14 1 170
## numInCat term ontology
## 1 90 cytoplasmic translation BP
## 2 219 translation BP
## 3 231 peptide biosynthetic process BP
## 4 285 peptide metabolic process BP
## 5 59 SRP-dependent cotranslational protein targeting to membrane BP
## 6 359 amide metabolic process BP
## Module ParentTerm Factor
## 1 Turquoise translation Down
## 2 Turquoise translation Down
## 3 Turquoise translation Down
## 4 Turquoise translation Down
## 5 Turquoise protein transport Down
## 6 Turquoise translation Down
The following list is a list of BP-ontology (“Biological Process”) GO terms that were significantly enriched (P < 0.05) within the modules that were upregulated or downregulated for calcification. The list has been further reduced by using the package rrvgo.
all_terms <- rbind(cal_down_terms,cal_up_terms)
all_terms <- all_terms[,c("Factor","GOterm","over_represented_pvalue","under_represented_pvalue","numDEInCat","numInCat","term","ontology","ParentTerm","Module")]
all_terms$GOterm<-as.factor(all_terms$GOterm)
dim(all_terms) #2140 reduced terms
## [1] 2140 10
length(unique(all_terms$GOterm)) #this represents 2004 unique terms between the two lists of reduced terms
## [1] 2004
length(unique(all_terms$ParentTerm)) #this represents 153 unique terms between the two lists of reduced terms
## [1] 153
This is collapsing the list from 2236 rows to 2004, representing the 2004 unique GO terms in the list above. It collapses the information in the columns “ParentTerm” and “Factor” so that for each GO term we get the parent terms associated and if this was enriched in modules upregulated for calcification, downregulated for calcification, or both.
goterms_shared <- all_terms %>%
group_by(GOterm) %>%
dplyr::summarise(
ParentTerm = paste(unique(ParentTerm), collapse = ", "),
Factor = paste(unique(Factor), collapse = ", "),
term = paste(unique(term), collapse = ", ")
)
dim(goterms_shared)
## [1] 2004 4
goterms_shared_full <- goterms_shared %>%
left_join(dplyr::select(all_terms,GOterm, Factor, over_represented_pvalue), by = "GOterm") %>% #select 3 columns GOterm, Factor, over_represented_pvalue from all_terms and left join by GOterm, turns this dataframe from 2004 rows back to the 2236 in all_terms
mutate(pvalue_Up = ifelse(Factor.y == "Up", over_represented_pvalue, NA)) %>% #add a column that is the p-value for the Up factor
mutate(pvalue_Down = ifelse(Factor.y == "Down", over_represented_pvalue, NA)) %>% #add a column that is the p-value for the Down factor
dplyr::select(-c("over_represented_pvalue", "Factor.y")) %>% #remove the unique columns so that we can collapse the dataframe
group_by(GOterm, ParentTerm, Factor.x, term) %>% #group by the repeated columns for the non-unique GO terms
dplyr::summarize(pvalue_Up = na.omit(pvalue_Up)[1], #carry over the p-value for the term in the up direction, by taking the first non-NA value.
pvalue_Down = na.omit(pvalue_Down)[1]) %>% #carry over the p-value for the term in the down direction, by taking the first non-NA value.
rename(Factor.x = "Factor") #rename column
## `summarise()` has grouped output by 'GOterm', 'ParentTerm', 'Factor.x'. You can
## override using the `.groups` argument.
write.csv(goterms_shared_full, "../../output/WGCNA/GO_analysis/Merged_GOterms_factor_ParentTerm.csv")
goterms_shared_full <- goterms_shared_full %>% group_by(ParentTerm) %>% mutate("N_in_Parent" = n()) %>% ungroup()
goterms_up <- goterms_shared_full %>% dplyr::filter(Factor=="Up")
goterms_down <- goterms_shared_full %>% dplyr::filter(Factor=="Down")
goterms_shared_only <- goterms_shared_full %>% dplyr::filter(Factor=="Down, Up")
nrow(goterms_up)
## [1] 1571
nrow(goterms_down)
## [1] 297
nrow(goterms_shared_only)
## [1] 136
nrow(goterms_up) + nrow(goterms_down) + nrow(goterms_shared_only) == nrow(goterms_shared_full)
## [1] TRUE
#how many parent terms in each
length(unique(goterms_up$ParentTerm))
## [1] 145
length(unique(goterms_down$ParentTerm))
## [1] 79
length(unique(goterms_shared_only$ParentTerm))
## [1] 42
freq_fig_up_pval <- ggplot(goterms_shared_only, aes(x = reorder(ParentTerm, -N_in_Parent), y = pvalue_Up)) +
geom_segment(aes(x = ParentTerm, xend = ParentTerm, y = 0, yend = pvalue_Up,),linewidth=.25) +
geom_point(size=1.5, alpha=1, colour = "red") +
geom_hline(yintercept = 0, linetype = "solid", color = "grey", size = 0.5, show.legend = TRUE) +
coord_flip() +
theme_classic() +
ylim(0, 0.05) +
theme(axis.text.x=element_text(vjust=0.5, hjust=0.95,size=5),
plot.title = element_text(margin = margin(t = 10, b = 10), hjust=0.5),
panel.background= element_rect(fill=NA, color='black'),
axis.text.y = element_text(vjust=0.5, size=5),
axis.title.x = element_blank(),
axis.title.y = element_blank()) +
guides(color = FALSE)
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: The `<scale>` argument of `guides()` cannot be `FALSE`. Use "none" instead as
## of ggplot2 3.3.4.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
freq_fig_down_pval <- ggplot(goterms_shared_only, aes(x = reorder(ParentTerm, -N_in_Parent), y = pvalue_Down)) +
geom_segment(aes(x = ParentTerm, xend = ParentTerm, y = 0, yend = pvalue_Down),linewidth=.25) +
geom_point(size=1.5, alpha=1, colour = "blue") +
geom_hline(yintercept = 0, linetype = "solid", color = "grey", size = 0.5, show.legend = TRUE) +
coord_flip() +
theme_classic() +
ylim(0, 0.05) +
theme(axis.text.x=element_text(vjust=0.5, hjust=0.95,size=5),
plot.title = element_text(margin = margin(t = 10, b = 10), hjust=0.5),
panel.background= element_rect(fill=NA, color='black'),
axis.text.y = element_text(vjust=0.5, size=5),
axis.title.x = element_blank(),
axis.title.y = element_blank())
compare_figs<-cowplot::plot_grid(freq_fig_up_pval, freq_fig_down_pval, nrow=1, align="v")
compare_figs
ggsave(filename="../../output/WGCNA/GO_analysis/Up_Down_Shared_ParentTerms.pdf", plot=compare_figs, dpi=300, height=3, units="in", limitsize=FALSE)
## Saving 7 x 3 in image
freq_fig_up_pval <- ggplot(goterms_up, aes(x = reorder(ParentTerm, -N_in_Parent), y = pvalue_Up)) +
geom_segment(aes(x = ParentTerm, xend = ParentTerm, y = 0, yend = pvalue_Up,),linewidth=.25) +
geom_point(size=1.5, alpha=1, colour = "red") +
geom_hline(yintercept = 0, linetype = "solid", color = "grey", size = 0.5, show.legend = TRUE) +
coord_flip() +
theme_classic() +
ylim(0, 0.05) +
theme(axis.text.x=element_text(vjust=0.5, hjust=0.95,size=5),
plot.title = element_text(margin = margin(t = 10, b = 10), hjust=0.5),
panel.background= element_rect(fill=NA, color='black'),
axis.text.y = element_text(vjust=0.5, size=5),
axis.title.x = element_blank(),
axis.title.y = element_blank()) +
guides(color = FALSE)
freq_fig_down_pval <- ggplot(goterms_down, aes(x = reorder(ParentTerm, -N_in_Parent), y = pvalue_Down)) +
geom_segment(aes(x = ParentTerm, xend = ParentTerm, y = 0, yend = pvalue_Down),linewidth=.25) +
geom_point(size=1.5, alpha=1, colour = "blue") +
geom_hline(yintercept = 0, linetype = "solid", color = "grey", size = 0.5, show.legend = TRUE) +
coord_flip() +
theme_classic() +
ylim(0, 0.05) +
theme(axis.text.x=element_text(vjust=0.5, hjust=0.95,size=5),
plot.title = element_text(margin = margin(t = 10, b = 10), hjust=0.5),
panel.background= element_rect(fill=NA, color='black'),
axis.text.y = element_text(vjust=0.5, size=5),
axis.title.x = element_blank(),
axis.title.y = element_blank())
compare_figs<-cowplot::plot_grid(freq_fig_up_pval, freq_fig_down_pval, nrow=1, align="v")
compare_figs
ggsave(filename="../../output/WGCNA/GO_analysis/Up_Down_Unique_ParentTerms.pdf", plot=compare_figs, dpi=300, height=10, units="in", limitsize=FALSE)
## Saving 7 x 10 in image
I downloaded the GOSlim terms list from the following website on 4/5/2024: http://www.informatics.jax.org/gotools/data/input/map2MGIslim.txt
I did so by going to this link and right clicking “here” at “Tab-delimited file mapping GO_ids to MGI GO_Slim category available for download here” and saying “Save link as…” , and I saved it as “map2MGIslim.txt”. I then converted this to CSV and uploaded to this github repository here.
Add slim terms to dataset
GOslim <- read.csv("../../data/map2MGIslim.csv") %>% dplyr::select(-c(term,aspect))
goterms_shared_full_slim <- goterms_shared_full %>%
inner_join(GOslim, by = c("GOterm" = "GO_id"))
goterms_up_slim <- goterms_shared_full_slim %>% dplyr::filter(Factor=="Up" | Factor=="Down, Up")
goterms_up_full_slim_plot <- goterms_up_slim %>%
ggplot(aes(x=pvalue_Up,
y=term)) +
geom_point(size = 0.5) +
expand_limits(x=0) +
facet_grid(GOSlim_bin ~ ., scales = "free", space='free', labeller = label_wrap_gen(width = 55, multi_line = TRUE)) +
labs(x="Over-represented p-value", y="") +
theme_bw() +
theme(
strip.text.y = element_text(size = 5, colour = "black", angle = 0, face = "bold"),
axis.text.y = element_text(size = 1, colour = "black"),
axis.title.x = element_text(size = 5, colour = "black"),
); goterms_up_full_slim_plot
ggsave("../../output/WGCNA/GO_analysis/Calc_Up_GOSlim.pdf", goterms_up_full_slim_plot, width = 5, height = 35)
goterms_up_full_slim_plot <- goterms_up_slim %>%
ggplot(aes(x=log10(pvalue_Up),
y=term)) +
geom_point(size = 0.5) +
expand_limits(x=0) +
facet_grid(GOSlim_bin ~ ., scales = "free", space='free', labeller = label_wrap_gen(width = 55, multi_line = TRUE)) +
labs(x="Log 10(Over-represented p-value)", y="") +
theme_bw() +
theme(
strip.text.y = element_text(size = 5, colour = "black", angle = 0, face = "bold"),
axis.text.y = element_text(size = 1, colour = "black"),
axis.title.x = element_text(size = 5, colour = "black"),
); goterms_up_full_slim_plot
ggsave("../../output/WGCNA/GO_analysis/Calc_Up_GOSlim_Log.pdf", goterms_up_full_slim_plot, width = 5, height = 35)
goterms_down_slim <- goterms_shared_full_slim %>% dplyr::filter(Factor=="Down" | Factor=="Down, Up")
goterms_down_full_slim_plot <- goterms_down_slim %>%
ggplot(aes(x=pvalue_Down,
y=term)) +
geom_point(size = 0.5) +
expand_limits(x=0) +
facet_grid(GOSlim_bin ~ ., scales = "free", space='free', labeller = label_wrap_gen(width = 55, multi_line = TRUE)) +
labs(x="Over-represented p-value", y="") +
theme_bw() +
theme(
strip.text.y = element_text(size = 5, colour = "black", angle = 0, face = "bold"),
axis.text = element_text(size = 1, colour = "black"),
axis.title.x = element_text(size = 5, colour = "black"),
legend.text=element_text(size = 5),
legend.title = element_text(size = 5)
); goterms_down_full_slim_plot
ggsave("../../output/WGCNA/GO_analysis/Calc_Down_GOSlim.pdf", goterms_down_full_slim_plot, width = 5, height = 35)
goterms_down_full_slim_plot <- goterms_down_slim %>%
ggplot(aes(x=log10(pvalue_Down),
y=term)) +
geom_point(size = 0.5) +
expand_limits(x=0) +
facet_grid(GOSlim_bin ~ ., scales = "free", space='free', labeller = label_wrap_gen(width = 55, multi_line = TRUE)) +
labs(x="Log 10(Over-represented p-value)", y="") +
theme_bw() +
theme(
strip.text.y = element_text(size = 5, colour = "black", angle = 0, face = "bold"),
axis.text = element_text(size = 1, colour = "black"),
axis.title.x = element_text(size = 5, colour = "black"),
legend.text=element_text(size = 5),
legend.title = element_text(size = 5)
); goterms_down_full_slim_plot
ggsave("../../output/WGCNA/GO_analysis/Calc_Down_GOSlim_Log.pdf", goterms_down_full_slim_plot, width = 5, height = 35)
SharedGOterms = # of GO terms within the parent term that are in the list for “Down”, “Up”, or “Down, Up”
result_parent_unique <- goterms_shared %>%
group_by(ParentTerm,Factor) %>%
dplyr::summarise(SharedGOterms = n_distinct(GOterm)) %>%
arrange(-SharedGOterms)
## `summarise()` has grouped output by 'ParentTerm'. You can override using the
## `.groups` argument.
parent_filtered_up <- result_parent_unique %>%
dplyr::filter(Factor=="Up" | Factor=="Down, Up")
#dplyr::filter(SharedGOterms>=5)
parent_filtered_down <- result_parent_unique %>%
dplyr::filter(Factor=="Down" | Factor=="Down, Up")
#dplyr::filter(SharedGOterms>=5)
result_up <- cal_up_terms %>%
dplyr::group_by(ParentTerm) %>%
dplyr::summarize(Number.of.terms = n_distinct(term))%>%
mutate(Calcification.direction = "Up")
head(result_up)
## # A tibble: 6 × 3
## ParentTerm Number.of.terms Calcification.direct…¹
## <chr> <int> <chr>
## 1 Arp2/3 complex-mediated actin nucleati… 13 Up
## 2 B cell differentiation 8 Up
## 3 BMP signaling pathway 19 Up
## 4 DNA topological change 7 Up
## 5 G protein-coupled receptor signaling p… 3 Up
## 6 Golgi organization 7 Up
## # ℹ abbreviated name: ¹Calcification.direction
dim(result_up)
## [1] 148 3
merged_up <- parent_filtered_up %>%
tidyr::separate_rows(ParentTerm, sep = ", ") %>%
dplyr::group_by(ParentTerm) %>%
dplyr::summarize(Sum_of_SharedGOterms = sum(SharedGOterms, na.rm = TRUE)) %>%
left_join(result_up, by = "ParentTerm") #join with result from above to get the Number.of.Terms column
merged_up_clean <- na.omit(merged_up)
head(merged_up_clean)
## # A tibble: 6 × 4
## ParentTerm Sum_of_SharedGOterms Number.of.terms Calcification.direct…¹
## <chr> <int> <int> <chr>
## 1 Arp2/3 complex-me… 13 13 Up
## 2 B cell differenti… 8 8 Up
## 3 BMP signaling pat… 19 19 Up
## 4 DNA topological c… 7 7 Up
## 5 G protein-coupled… 3 3 Up
## 6 Golgi organization 7 7 Up
## # ℹ abbreviated name: ¹Calcification.direction
dim(merged_up_clean)
## [1] 147 4
result_down <- cal_down_terms %>%
dplyr::group_by(ParentTerm) %>%
dplyr::summarize(Number.of.terms = n_distinct(term))%>%
mutate(Calcification.direction = "Down")
dim(result_down)
## [1] 85 3
head(result_down)
## # A tibble: 6 × 3
## ParentTerm Number.of.terms Calcification.direct…¹
## <chr> <int> <chr>
## 1 Arp2/3 complex-mediated actin nucleati… 2 Down
## 2 DNA topological change 2 Down
## 3 G protein-coupled receptor signaling p… 2 Down
## 4 IRE1-mediated unfolded protein response 5 Down
## 5 NAD metabolic process 6 Down
## 6 NADH regeneration 1 Down
## # ℹ abbreviated name: ¹Calcification.direction
merged_down <- parent_filtered_down %>%
tidyr::separate_rows(ParentTerm, sep = ", ") %>%
dplyr::group_by(ParentTerm) %>%
dplyr::summarize(Sum_of_SharedGOterms = sum(SharedGOterms, na.rm = TRUE)) %>%
left_join(result_down, by = "ParentTerm")
merged_down_clean<- na.omit(merged_down)
head(merged_down_clean)
## # A tibble: 6 × 4
## ParentTerm Sum_of_SharedGOterms Number.of.terms Calcification.direct…¹
## <chr> <int> <int> <chr>
## 1 Arp2/3 complex-me… 2 2 Down
## 2 DNA topological c… 2 2 Down
## 3 G protein-coupled… 2 2 Down
## 4 IRE1-mediated unf… 5 5 Down
## 5 NAD metabolic pro… 6 6 Down
## 6 NADH regeneration 1 1 Down
## # ℹ abbreviated name: ¹Calcification.direction
cal_freq_terms_filtered_up <- merged_up_clean %>%
filter(Number.of.terms>=10) %>%
filter(Calcification.direction=="Up")
dim(cal_freq_terms_filtered_up)
## [1] 68 4
#counts$Direction.of.flat.origin<- factor(counts$Direction.of.flat.origin, levels =c("up","no pattern","down"))
#counts$Module<- factor(counts$Module, levels=c("Blue","Brown","Greenyellow","Cyan","Pink","Magenta","Lightcyan","Midnight blue","Purple","Turquiose","Red","Black"))
freq_fig_up<-ggplot(cal_freq_terms_filtered_up, aes(y=Number.of.terms,x=reorder(ParentTerm, Number.of.terms), group=1))+
#facet_wrap(~Calcification.direction, nrow = 1)+
geom_point(size=5, alpha=1, pch=21,color="black")+
geom_segment(aes(x=ParentTerm, xend=ParentTerm, y=0, yend=Number.of.terms)) +
geom_hline(yintercept = 0, linetype="solid", color = 'black', size=0.5, show.legend = TRUE)+
coord_flip()+
scale_y_continuous(expression(GO~term~counts),limits=c(0,60))+
#scale_color_manual("Direction.of.flat.origin",values= c("up"="#b2182b","no pattern"="grey","down" ="#67a9cf"))+
#scale_fill_manual("Direction.of.flat.origin",values= c("up"="#b2182b","no pattern"="grey","down" ="#67a9cf"))+
scale_fill_gradientn(colours=c("white","#fddbc7","#f4a582","#d6604d","#b2182b"), na.value = "grey98",limits = c(0, 100))+
#scale_color_gradientn(colours=c("#b2182b","#fddbc7","white","#d1e5f0","#67a9cf", "#67a9cf", "#2166ac"), na.value = "grey98",limits = c(-0, 40))+
theme_classic()+
theme(axis.text.x=element_text(vjust=0.5, hjust=0.95,size=12),
plot.title = element_text(margin = margin(t = 10, b = 10), hjust=0.5),
panel.background= element_rect(fill=NA, color='black'),
legend.title = element_blank(),
axis.text.y = element_text(vjust=0.5, size=12), #making the axis text larger
axis.title.x = element_blank(),#making the axis title larger
axis.title.y = element_blank(),
strip.text = element_text(size=12))#making the axis title larger
freq_fig_up
ggsave(filename="../../output/WGCNA/GO_analysis/freq_fig_up.pdf", plot=freq_fig_up, dpi=300, height=12, units="in", limitsize=FALSE)
## Saving 7 x 12 in image
cal_freq_terms_filtered_down <- merged_down_clean %>%
filter(Number.of.terms>=10)
#filter(Calcification.direction=="Down")
cal_freq_terms_filtered_down
## # A tibble: 12 × 4
## ParentTerm Sum_of_SharedGOterms Number.of.terms Calcification.direct…¹
## <chr> <int> <int> <chr>
## 1 fatty acid metab… 12 12 Down
## 2 intracellular si… 12 12 Down
## 3 metabolic process 15 15 Down
## 4 negative regulat… 12 12 Down
## 5 peptidyl-serine … 16 16 Down
## 6 protein transport 16 16 Down
## 7 proton motive fo… 26 26 Down
## 8 regulation of pr… 13 13 Down
## 9 regulation of tr… 17 17 Down
## 10 reproduction 13 13 Down
## 11 tricarboxylic ac… 11 11 Down
## 12 ubiquitin-depend… 26 26 Down
## # ℹ abbreviated name: ¹Calcification.direction
freq_fig_down<-ggplot(cal_freq_terms_filtered_down, aes(y=Number.of.terms,x=reorder(ParentTerm, Number.of.terms)))+
#facet_wrap(~Calcification.direction, nrow = 1)+
geom_point(size=5, alpha=1, pch=21,color="black")+
geom_segment(aes(x=ParentTerm, xend=ParentTerm, y=0, yend=Number.of.terms)) +
#geom_hline(yintercept = 0, linetype="solid", color = 'black', size=0.5, show.legend = TRUE)+
coord_flip()+
scale_y_continuous(expression(GO~term~counts),limits=c(0,27))+
#scale_color_manual("Direction.of.flat.origin",values= c("up"="#b2182b","no pattern"="grey","down" ="#67a9cf"))+
#scale_fill_manual("Direction.of.flat.origin",values= c("up"="#b2182b","no pattern"="grey","down" ="#67a9cf"))+
scale_fill_gradientn(colours=c("white","#d1e5f0","#92c5de","#4393c3","#2166ac"), na.value = "grey98",limits = c(0, 100))+
#scale_fill_gradientn(colours=c("#fddbc7","#f4a582","#d6604d","#b2182b"), na.value = "grey98",limits = c(10, 40))+
#scale_color_gradientn(colours=c("#b2182b","#fddbc7","white","#d1e5f0","#67a9cf", "#67a9cf", "#2166ac"), na.value = "grey98",limits = c(-0, 40))+
theme_classic()+
theme(axis.text.x=element_text(vjust=0.5, hjust=0.95, size=12),
plot.title = element_text(margin = margin(t = 10, b = 10), hjust=0.5),
panel.background= element_rect(fill=NA, color='black'),
legend.title = element_blank(),
axis.text.y = element_text(vjust=0.5, size=12), #making the axis text larger
axis.title.x = element_blank(),#making the axis title larger
axis.title.y = element_blank(),
strip.text = element_text(size=12))#making the axis title larger
freq_fig_down
ggsave(filename="../../output/WGCNA/GO_analysis/freq_fig_down.pdf", plot=freq_fig_down, dpi=300, height=12, units="in", limitsize=FALSE)
## Saving 7 x 12 in image
compare_figs<-cowplot::plot_grid(freq_fig_up, freq_fig_down, nrow=2, align="v")
compare_figs
cal_freq_terms_filtered_up <- merged_up_clean %>%
filter(Number.of.terms>=20) %>%
filter(Calcification.direction=="Up")
cal_freq_terms_filtered_up
## # A tibble: 25 × 4
## ParentTerm Sum_of_SharedGOterms Number.of.terms Calcification.direct…¹
## <chr> <int> <int> <chr>
## 1 IRE1-mediated un… 41 41 Up
## 2 anatomical struc… 20 20 Up
## 3 anatomical struc… 22 22 Up
## 4 cell cycle 57 57 Up
## 5 cell projection … 21 21 Up
## 6 chondroitin sulf… 24 24 Up
## 7 fatty acid metab… 23 23 Up
## 8 innate immune re… 28 28 Up
## 9 intracellular ca… 24 24 Up
## 10 intracellular si… 33 33 Up
## # ℹ 15 more rows
## # ℹ abbreviated name: ¹Calcification.direction
#counts$Direction.of.flat.origin<- factor(counts$Direction.of.flat.origin, levels =c("up","no pattern","down"))
#counts$Module<- factor(counts$Module, levels=c("Blue","Brown","Greenyellow","Cyan","Pink","Magenta","Lightcyan","Midnight blue","Purple","Turquiose","Red","Black"))
freq_fig_up<-ggplot(cal_freq_terms_filtered_up, aes(y=Number.of.terms,x=reorder(ParentTerm, Number.of.terms),group=1))+
#facet_wrap(~Calcification.direction, nrow = 1)+
geom_point(size=5, alpha=1, pch=21,color="black")+
geom_segment(aes(x=ParentTerm, xend=ParentTerm, y=0, yend=Number.of.terms)) +
geom_hline(yintercept = 0, linetype="solid", color = 'black', size=0.5, show.legend = TRUE)+
coord_flip()+
scale_y_continuous(expression(GO~term~counts),limits=c(0,60))+
#scale_color_manual("Direction.of.flat.origin",values= c("up"="#b2182b","no pattern"="grey","down" ="#67a9cf"))+
#scale_fill_manual("Direction.of.flat.origin",values= c("up"="#b2182b","no pattern"="grey","down" ="#67a9cf"))+
scale_fill_gradientn(colours=c("white","#fddbc7","#f4a582","#d6604d","#b2182b"), na.value = "grey98",limits = c(0, 100))+
#scale_color_gradientn(colours=c("#b2182b","#fddbc7","white","#d1e5f0","#67a9cf", "#67a9cf", "#2166ac"), na.value = "grey98",limits = c(-0, 40))+
theme_classic()+
theme(axis.text.x=element_text(vjust=0.5, hjust=0.95,size=12),
plot.title = element_text(margin = margin(t = 10, b = 10), hjust=0.5),
panel.background= element_rect(fill=NA, color='black'),
legend.title = element_blank(),
axis.text.y = element_text(vjust=0.5, size=12), #making the axis text larger
axis.title.x = element_blank(),#making the axis title larger
axis.title.y = element_blank(),
strip.text = element_text(size=12))#making the axis title larger
freq_fig_up
ggsave(filename="../../output/WGCNA/GO_analysis/freq_fig_up_gr20.pdf", plot=freq_fig_up, dpi=300, height=12, units="in", limitsize=FALSE)
## Saving 7 x 12 in image
cal_freq_terms_filtered_down <- merged_down_clean %>%
filter(Number.of.terms>=20)
#filter(Calcification.direction=="Down")
cal_freq_terms_filtered_down
## # A tibble: 2 × 4
## ParentTerm Sum_of_SharedGOterms Number.of.terms Calcification.direct…¹
## <chr> <int> <int> <chr>
## 1 proton motive for… 26 26 Down
## 2 ubiquitin-depende… 26 26 Down
## # ℹ abbreviated name: ¹Calcification.direction
freq_fig_down<-ggplot(cal_freq_terms_filtered_down, aes(y=Number.of.terms,x=reorder(ParentTerm, Number.of.terms)))+
#facet_wrap(~Calcification.direction, nrow = 1)+
geom_point(size=5, alpha=1, pch=21,color="black")+
geom_segment(aes(x=ParentTerm, xend=ParentTerm, y=0, yend=Number.of.terms)) +
#geom_hline(yintercept = 0, linetype="solid", color = 'black', size=0.5, show.legend = TRUE)+
coord_flip()+
scale_y_continuous(expression(GO~term~counts),limits=c(0,27))+
#scale_color_manual("Direction.of.flat.origin",values= c("up"="#b2182b","no pattern"="grey","down" ="#67a9cf"))+
#scale_fill_manual("Direction.of.flat.origin",values= c("up"="#b2182b","no pattern"="grey","down" ="#67a9cf"))+
scale_fill_gradientn(colours=c("white","#d1e5f0","#92c5de","#4393c3","#2166ac"), na.value = "grey98",limits = c(0, 100))+
#scale_fill_gradientn(colours=c("#fddbc7","#f4a582","#d6604d","#b2182b"), na.value = "grey98",limits = c(10, 40))+
#scale_color_gradientn(colours=c("#b2182b","#fddbc7","white","#d1e5f0","#67a9cf", "#67a9cf", "#2166ac"), na.value = "grey98",limits = c(-0, 40))+
theme_classic()+
theme(axis.text.x=element_text(vjust=0.5, hjust=0.95, size=12),
plot.title = element_text(margin = margin(t = 10, b = 10), hjust=0.5),
panel.background= element_rect(fill=NA, color='black'),
legend.title = element_blank(),
axis.text.y = element_text(vjust=0.5, size=12), #making the axis text larger
axis.title.x = element_blank(),#making the axis title larger
axis.title.y = element_blank(),
strip.text = element_text(size=12))#making the axis title larger
freq_fig_down
ggsave(filename="../../output/WGCNA/GO_analysis/freq_fig_down_gr20.pdf", plot=freq_fig_down, dpi=300, height=12, units="in", limitsize=FALSE)
## Saving 7 x 12 in image
compare_figs<-cowplot::plot_grid(freq_fig_up, freq_fig_down, nrow=2, align="v")
compare_figs
cal_freq_terms_filtered_up_all <- merged_up_clean %>%
filter(Calcification.direction=="Up")
cal_freq_terms_filtered_up_all
## # A tibble: 147 × 4
## ParentTerm Sum_of_SharedGOterms Number.of.terms Calcification.direct…¹
## <chr> <int> <int> <chr>
## 1 Arp2/3 complex-m… 13 13 Up
## 2 B cell different… 8 8 Up
## 3 BMP signaling pa… 19 19 Up
## 4 DNA topological … 7 7 Up
## 5 G protein-couple… 3 3 Up
## 6 Golgi organizati… 7 7 Up
## 7 IRE1-mediated un… 41 41 Up
## 8 L-ascorbic acid … 3 3 Up
## 9 Mo-molybdopterin… 5 5 Up
## 10 NAD metabolic pr… 4 4 Up
## # ℹ 137 more rows
## # ℹ abbreviated name: ¹Calcification.direction
#counts$Direction.of.flat.origin<- factor(counts$Direction.of.flat.origin, levels =c("up","no pattern","down"))
#counts$Module<- factor(counts$Module, levels=c("Blue","Brown","Greenyellow","Cyan","Pink","Magenta","Lightcyan","Midnight blue","Purple","Turquiose","Red","Black"))
freq_fig_up<-ggplot(cal_freq_terms_filtered_up_all, aes(y=Number.of.terms,x=reorder(ParentTerm, Number.of.terms),group=1))+
#facet_wrap(~Calcification.direction, nrow = 1)+
geom_point(size=5, alpha=1, pch=21,color="black")+
geom_segment(aes(x=ParentTerm, xend=ParentTerm, y=0, yend=Number.of.terms)) +
geom_hline(yintercept = 0, linetype="solid", color = 'black', size=0.5, show.legend = TRUE)+
coord_flip()+
scale_y_continuous(expression(GO~term~counts),limits=c(0,60))+
#scale_color_manual("Direction.of.flat.origin",values= c("up"="#b2182b","no pattern"="grey","down" ="#67a9cf"))+
#scale_fill_manual("Direction.of.flat.origin",values= c("up"="#b2182b","no pattern"="grey","down" ="#67a9cf"))+
scale_fill_gradientn(colours=c("white","#fddbc7","#f4a582","#d6604d","#b2182b"), na.value = "grey98",limits = c(0, 100))+
#scale_color_gradientn(colours=c("#b2182b","#fddbc7","white","#d1e5f0","#67a9cf", "#67a9cf", "#2166ac"), na.value = "grey98",limits = c(-0, 40))+
theme_classic()+
theme(axis.text.x=element_text(vjust=0.5, hjust=0.95,size=12),
plot.title = element_text(margin = margin(t = 10, b = 10), hjust=0.5),
panel.background= element_rect(fill=NA, color='black'),
legend.title = element_blank(),
axis.text.y = element_text(vjust=0.5, size=12), #making the axis text larger
axis.title.x = element_blank(),#making the axis title larger
axis.title.y = element_blank(),
strip.text = element_text(size=12))#making the axis title larger
freq_fig_up
ggsave(filename="../../output/WGCNA/GO_analysis/freq_fig_up_ALL.pdf", plot=freq_fig_up, dpi=300, height=17, units="in", limitsize=FALSE)
## Saving 7 x 17 in image
cal_freq_terms_filtered_down_all <- merged_down_clean %>%
filter(Calcification.direction=="Down")
cal_freq_terms_filtered_down_all
## # A tibble: 84 × 4
## ParentTerm Sum_of_SharedGOterms Number.of.terms Calcification.direct…¹
## <chr> <int> <int> <chr>
## 1 Arp2/3 complex-m… 2 2 Down
## 2 DNA topological … 2 2 Down
## 3 G protein-couple… 2 2 Down
## 4 IRE1-mediated un… 5 5 Down
## 5 NAD metabolic pr… 6 6 Down
## 6 NADH regeneration 1 1 Down
## 7 P granule assemb… 1 1 Down
## 8 actin cytoskelet… 1 1 Down
## 9 acyl-CoA metabol… 1 1 Down
## 10 anatomical struc… 4 4 Down
## # ℹ 74 more rows
## # ℹ abbreviated name: ¹Calcification.direction
freq_fig_down<-ggplot(cal_freq_terms_filtered_down_all, aes(y=Number.of.terms,x=reorder(ParentTerm, Number.of.terms)))+
#facet_wrap(~Calcification.direction, nrow = 1)+
geom_point(size=5, alpha=1, pch=21,color="black")+
geom_segment(aes(x=ParentTerm, xend=ParentTerm, y=0, yend=Number.of.terms)) +
#geom_hline(yintercept = 0, linetype="solid", color = 'black', size=0.5, show.legend = TRUE)+
coord_flip()+
scale_y_continuous(expression(GO~term~counts),limits=c(0,27))+
#scale_color_manual("Direction.of.flat.origin",values= c("up"="#b2182b","no pattern"="grey","down" ="#67a9cf"))+
#scale_fill_manual("Direction.of.flat.origin",values= c("up"="#b2182b","no pattern"="grey","down" ="#67a9cf"))+
scale_fill_gradientn(colours=c("white","#d1e5f0","#92c5de","#4393c3","#2166ac"), na.value = "grey98",limits = c(0, 100))+
#scale_fill_gradientn(colours=c("#fddbc7","#f4a582","#d6604d","#b2182b"), na.value = "grey98",limits = c(10, 40))+
#scale_color_gradientn(colours=c("#b2182b","#fddbc7","white","#d1e5f0","#67a9cf", "#67a9cf", "#2166ac"), na.value = "grey98",limits = c(-0, 40))+
theme_classic()+
theme(axis.text.x=element_text(vjust=0.5, hjust=0.95, size=12),
plot.title = element_text(margin = margin(t = 10, b = 10), hjust=0.5),
panel.background= element_rect(fill=NA, color='black'),
legend.title = element_blank(),
axis.text.y = element_text(vjust=0.5, size=12), #making the axis text larger
axis.title.x = element_blank(),#making the axis title larger
axis.title.y = element_blank(),
strip.text = element_text(size=12))#making the axis title larger
freq_fig_down
ggsave(filename="../../output/WGCNA/GO_analysis/freq_fig_down_ALL.pdf", plot=freq_fig_down, dpi=300, height=17, units="in", limitsize=FALSE)
## Saving 7 x 17 in image
compare_figs<-cowplot::plot_grid(freq_fig_up, freq_fig_down, ncol=2, align="h")
compare_figs